Background

Open-Meteo maintains an API for historical weather that allows for non-commercial usage of historical weather data maintained by the website.

This file builds on _v001, _v002, and _v003 to run exploratory analysis on some historical weather data.

Functions and Libraries

The exploration process uses tidyverse, ranger, several generic custom functions, and several functions specific to Open-Meteo processing. First, tidyverse, ranger, and the generic functions are loaded:

library(tidyverse) # tidyverse functionality is included throughout
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ranger) # predict() does not work on ranger objects unless ranger has been called
## Warning: package 'ranger' was built under R version 4.2.3
source("./Generic_Added_Utility_Functions_202105_v001.R") # Basic functions

Next, specific functions written in _v001 are copied:

# Read a contiguous range of rows from a CSV file
partialCSVRead <- function(loc, firstRow=1L, lastRow=+Inf, col_names=TRUE, ...) {

    # FUNCTION arguments
    # loc: location of the file to be read
    # firstRow: first relevant row of the file (header line when col_names is TRUE, otherwise first data line)
    # lastRow: last relevant row of the file (+Inf means read through the end of the file)
    # col_names: passed to readr::read_csv as col_names=
    #            TRUE means the row at firstRow holds the column names and data starts on the next row
    #            FALSE means column names are auto-generated and data starts at firstRow
    #            character vector means these are the column names and data starts at firstRow
    # ...: additional arguments passed to readr::read_csv

    # Everything before firstRow is skipped
    nSkip <- firstRow - 1

    # A header row is not counted as data, so one extra data row is needed when there is no header
    nExtra <- if(isTRUE(col_names)) 0 else 1
    nRead <- lastRow - firstRow + nExtra

    # Read the requested rows and return
    readr::read_csv(loc, col_names=col_names, skip=nSkip, n_max=nRead, ...)

}


# Get the break points for gaps in a vector (e.g., 0, 3, 5:8, 20 has run starts 0, 3, 5, 20 and run ends 0, 3, 8, 20)
vecGaps <- function(x, addElements=c(), sortUnique=TRUE) {
    
    # FUNCTION arguments
    # x: numeric vector to be split in to runs of consecutive values
    # addElements: elements to be added to x prior to finding runs (empty means add nothing)
    # sortUnique: boolean, should x be sorted and de-duplicated prior to finding runs?
    #             if not TRUE, x is used in the order passed (NA handling is not defined in that case)
    
    # Returns a list with two elements
    # starts: first value of each run, followed by +Inf as a sentinel
    # ends: last value of each run
    
    if(length(addElements)>0) x <- c(addElements, x)
    if(isTRUE(sortUnique)) x <- unique(sort(x))
    
    # A new run begins wherever the step from the previous element is greater than 1
    # base::diff() is used so that the result does not depend on dplyr masking stats::lag()
    isBreak <- diff(x) > 1
    
    list("starts"=c(x[c(TRUE, isBreak)], +Inf), 
         "ends"=x[c(isBreak, TRUE)]
         )
    
}


# Locate the blank lines of a single file and convert them to break points
flatFileGaps <- function(loc) {

    # FUNCTION arguments
    # loc: file location

    # Line numbers of the zero-length lines in the file
    fileLines <- readLines(loc)
    blankRows <- which(stringr::str_length(fileLines)==0)

    # Row 0 is added so that the lines before the first blank line form a section
    vecGaps(blankRows, addElements=0)

}


# Read every blank-line-delimited section of a file as its own CSV
readMultiCSV <- function(loc, col_names=TRUE, ...) {

    # FUNCTION arguments
    # loc: file location
    # col_names: passed to partialCSVRead() as col_names=
    # ...: additional arguments passed to partialCSVRead()

    # Break points defined by the blank lines of the file
    breaks <- flatFileGaps(loc)

    # Section i runs from the row after the end of blank run i to the row before the start of the next blank run
    readSection <- function(i) {
        partialCSVRead(loc, 
                       firstRow=breaks$ends[i]+1, 
                       lastRow=breaks$starts[i+1]-1, 
                       col_names=col_names, 
                       ...
                       )
    }

    # Return a list with one element per section
    lapply(seq_along(breaks$ends), FUN=readSection)

}


# Create URL with specified parameters for downloading data from Open Meteo
openMeteoURLCreate <- function(mainURL="https://archive-api.open-meteo.com/v1/archive", 
                               lat=45, 
                               lon=-90, 
                               startDate=paste(year(Sys.Date())-1, "01", "01", sep="-"), 
                               endDate=paste(year(Sys.Date())-1, "12", "31", sep="-"), 
                               hourlyMetrics=NULL, 
                               dailyMetrics=NULL,
                               tz="GMT", 
                               ...
                               ) {
    
    # FUNCTION arguments
    # mainURL: base URL that the query string is appended to
    # lat: latitude, written to the URL as latitude=
    # lon: longitude, written to the URL as longitude=
    # startDate: written to the URL as start_date= (default is January 1 of the previous calendar year)
    # endDate: written to the URL as end_date= (default is December 31 of the previous calendar year)
    # hourlyMetrics: string written to the URL as hourly= (NULL means omit)
    # dailyMetrics: string written to the URL as daily= (NULL means omit)
    # tz: time zone written to the URL as timezone=, with each "/" encoded as %2F
    # ...: additional strings appended as-is to the end of the URL
    
    # Create formatted string
    fString <- paste0(mainURL, 
                      "?latitude=", 
                      lat, 
                      "&longitude=", 
                      lon, 
                      "&start_date=", 
                      startDate, 
                      "&end_date=", 
                      endDate
                      )
    if(!is.null(hourlyMetrics)) fString <- paste0(fString, "&hourly=", hourlyMetrics)
    if(!is.null(dailyMetrics)) fString <- paste0(fString, "&daily=", dailyMetrics)
    
    # Encode every "/" in the time zone, since names such as America/Argentina/Buenos_Aires have more than one
    tzEncoded <- gsub("/", "%2F", tz, fixed=TRUE)
    
    # Return the formatted string
    paste0(fString, "&timezone=", tzEncoded, ...)
    
}


# Wrapper that fills in Open Meteo download parameters and returns the resulting URL
helperOpenMeteoURL <- function(cityName=NULL,
                               lat=NULL,
                               lon=NULL,
                               hourlyMetrics=NULL,
                               hourlyIndices=NULL,
                               hourlyDesc=tblMetricsHourly,
                               dailyMetrics=NULL,
                               dailyIndices=NULL,
                               dailyDesc=tblMetricsDaily,
                               startDate=NULL, 
                               endDate=NULL, 
                               tz=NULL,
                               ...
                               ) {

    # FUNCTION arguments
    # cityName: city to be looked up in maps::us.cities (used only when lat or lon is NULL)
    # lat: latitude (NULL means look up from cityName)
    # lon: longitude (NULL means look up from cityName)
    # hourlyMetrics: string of hourly metrics (NULL means create from hourlyIndices if provided)
    # hourlyIndices: rows of hourlyDesc whose metric column will be used as the hourly metrics
    # hourlyDesc: table with column metric that hourlyIndices are applied to
    # dailyMetrics: string of daily metrics (NULL means create from dailyIndices if provided)
    # dailyIndices: rows of dailyDesc whose metric column will be used as the daily metrics
    # dailyDesc: table with column metric that dailyIndices are applied to
    # startDate, endDate, tz: NULL means use the default from openMeteoURLCreate()
    # ...: additional arguments passed to openMeteoURLCreate()

    # Look up coordinates by city name when either coordinate is missing
    needCoords <- is.null(lat) || is.null(lon)
    if(needCoords) {
        if(is.null(cityName)) stop("\nMust provide lat/lon or city name available in maps::us.cities\n")
        cityData <- filter(tibble::as_tibble(maps::us.cities), name==cityName)
        if(nrow(cityData)!=1) stop("\nMust provide city name that maps uniquely to maps::us.cities$name\n")
        lat <- cityData$lat[1]
        lon <- cityData$long[1]
    }

    # Build the hourly metrics string from row indices when only indices are given
    if(is.null(hourlyMetrics) && !is.null(hourlyIndices)) {
        hourlyMetrics <- paste0(pull(slice(hourlyDesc, hourlyIndices), metric), collapse=",")
        cat("\nHourly metrics created from indices:", hourlyMetrics, "\n\n")
    }

    # Build the daily metrics string from row indices when only indices are given
    if(is.null(dailyMetrics) && !is.null(dailyIndices)) {
        dailyMetrics <- paste0(pull(slice(dailyDesc, dailyIndices), metric), collapse=",")
        cat("\nDaily metrics created from indices:", dailyMetrics, "\n\n")
    }

    # Fall back to the defaults declared by openMeteoURLCreate() for anything passed as NULL
    urlDefaults <- formals(openMeteoURLCreate)
    if(is.null(startDate)) startDate <- eval(urlDefaults$startDate)
    if(is.null(endDate)) endDate <- eval(urlDefaults$endDate)
    if(is.null(tz)) tz <- eval(urlDefaults$tz)

    # Create and return URL
    openMeteoURLCreate(lat=lat,
                       lon=lon, 
                       startDate=startDate, 
                       endDate=endDate, 
                       hourlyMetrics=hourlyMetrics, 
                       dailyMetrics=dailyMetrics, 
                       tz=tz,
                       ...
                       )

}


# Read JSON data returned from Open Meteo and split it in to daily, hourly, units, and description tables
readOpenMeteoJSON <- function(js, mapDaily=tblMetricsDaily, mapHourly=tblMetricsHourly) {
    
    # FUNCTION arguments: 
    # js: JSON source passed as the first argument to jsonlite::read_json()
    #     NOTE(review): presumably the location of the downloaded JSON file rather than a parsed list - confirm
    # mapDaily: mapping file for daily metrics (joined to the daily units by its metric column)
    # mapHourly: mapping file for hourly metrics (joined to the hourly units by its metric column)
    
    # Returns a list with elements tblDaily, tblHourly, tblUnits, and tblDescription
    # tblDaily, tblHourly, and tblUnits are NULL when the matching elements are not in the JSON
    
    # Get the object and names
    jsObj <- jsonlite::read_json(js, simplifyVector = TRUE)
    nms <- jsObj %>% names()
    cat("\nObjects in JSON include:", paste(nms, collapse=", "), "\n\n")
    
    # Set default objects as NULL
    tblDaily <- NULL
    tblHourly <- NULL
    tblUnitsDaily <- NULL
    tblUnitsHourly <- NULL
    
    # Get daily and hourly as tibble if relevant
    if("daily" %in% nms) tblDaily <- jsObj$daily %>% tibble::as_tibble() %>% omProcessDaily()
    if("hourly" %in% nms) tblHourly <- jsObj$hourly %>% tibble::as_tibble() %>% omProcessHourly()
    
    # Helper function for unit conversions
    # x: units element from the JSON (one entry per metric)
    # mapper: mapping file joined on name (from x) = metric (from mapper)
    # desc: label stored as metricType (NULL means derive from the expression passed as x)
    helperMetricUnit <- function(x, mapper, desc=NULL) {
        # The default label is the text of the x argument with everything through the last "$" removed
        # For example, passing jsObj$daily_units gives "daily_units"
        if(is.null(desc)) 
            desc <- as.list(match.call())$x %>% 
                deparse() %>% 
                stringr::str_replace_all(pattern=".*\\$", replacement="")
        # One row per metric, with the degree symbol in the unit text replaced by "deg "
        x %>% 
            tibble::as_tibble() %>% 
            pivot_longer(cols=everything()) %>% 
            left_join(mapper, by=c("name"="metric")) %>% 
            mutate(value=stringr::str_replace(value, "\u00b0", "deg ")) %>% 
            mutate(metricType=desc) %>% 
            select(metricType, everything())
    }
    
    # Get the unit descriptions
    # tblUnits is whichever of hourly and daily exists, both stacked (hourly first), or NULL if neither exists
    if("daily_units" %in% nms) tblUnitsDaily <- helperMetricUnit(jsObj$daily_units, mapDaily)
    if("hourly_units" %in% nms) tblUnitsHourly <- helperMetricUnit(jsObj$hourly_units, mapHourly)
    if(is.null(tblUnitsDaily) & !is.null(tblUnitsHourly)) tblUnits <- tblUnitsHourly
    else if(!is.null(tblUnitsDaily) & is.null(tblUnitsHourly)) tblUnits <- tblUnitsDaily
    else if(!is.null(tblUnitsDaily) & !is.null(tblUnitsHourly)) 
        tblUnits <- bind_rows(tblUnitsHourly, tblUnitsDaily)
    else tblUnits <- NULL
    
    # Put everything else together (all JSON elements other than the data and units elements)
    tblDescription <- jsObj[setdiff(nms, c("hourly", "hourly_units", "daily", "daily_units"))] %>%
        tibble::as_tibble()
    
    # Return the list objects
    list(tblDaily=tblDaily, tblHourly=tblHourly, tblUnits=tblUnits, tblDescription=tblDescription)
    
}


# Print Open meteo metadata in prettified format (one "name: value" line per column)
prettyOpenMeteoMeta <- function(df, extr="tblDescription") {
    
    # FUNCTION arguments:
    # df: data frame of metadata, or a list containing the data frame as element extr
    # extr: name of the list element to extract (used only when df is a list)
    
    if("list" %in% class(df)) df <- df[[extr]]
    
    # Columns are extracted with [[ ]] so that the loop variable is always treated as a column name
    # (a column that is itself called "name" would otherwise be printed on every line)
    for(name in names(df)) {
        cat("\n", name, ": ", df[[name]], sep="")
    }
    cat("\n\n")
    
}


# Add a parsed date column to Open Meteo daily data
omProcessDaily <- function(tbl, extr="tblDaily") {

    # FUNCTION arguments:
    # tbl: tibble of daily data with column time, or a list containing the tibble as element extr
    # extr: name of the list element to extract (used only when tbl is a list)

    isListInput <- "list" %in% class(tbl)
    if(isListInput) tbl <- tbl[[extr]]

    # Parse time as year-month-day and place the resulting date column first
    tblDated <- mutate(tbl, date=lubridate::ymd(time))
    select(tblDated, date, everything())

}


# Add parsed time, date, and hour columns to Open Meteo hourly data
omProcessHourly <- function(tbl, extr="tblHourly") {

    # FUNCTION arguments:
    # tbl: tibble of hourly data with column time, or a list containing the tibble as element extr
    # extr: name of the list element to extract (used only when tbl is a list)

    isListInput <- "list" %in% class(tbl)
    if(isListInput) tbl <- tbl[[extr]]

    # Keep the original time text, then overwrite time with the parsed date-time
    tblParsed <- mutate(tbl, origTime=time)
    tblParsed <- mutate(tblParsed, time=lubridate::ymd_hm(time))

    # Date and hour are taken from the parsed date-time
    tblParsed <- mutate(tblParsed, date=lubridate::date(time), hour=lubridate::hour(time))

    # Place the time-related columns first
    select(tblParsed, time, date, hour, everything())

}


# Simple predictive model for categorical variable
simpleOneVarPredict <- function(df, 
                                tgt, 
                                prd, 
                                dfTest=NULL,
                                nPrint=30, 
                                showPlot=TRUE, 
                                returnData=TRUE
                                ) {
    
    # FUNCTION ARGUMENTS:
    # df: data frame or tibble with key elements (training data set)
    # tgt: target variable
    # prd: predictor variable
    # dfTest: test dataset for applying predictions
    # nPrint: maximum number of lines of confusion matrix to print
    #         0 means do not print any summary statistics
    # showPlot: boolean, should overlap plot be created and shown?
    # returnData: boolean, should the prediction and confusion data be returned as a list?
    
    # Helper: accuracy by target subgroup, from a counts table with columns tgt, n, and correct
    # Direct sums are used so that the TRUE and FALSE columns always exist, even if one outcome never occurs
    confBySubgroup <- function(dfCounts) {
        dfCounts %>%
            group_by(across(all_of(tgt))) %>%
            summarize(`FALSE`=sum(n[correct %in% FALSE]), 
                      `TRUE`=sum(n[correct %in% TRUE]), 
                      .groups="drop"
                      ) %>%
            mutate(n=`TRUE`+`FALSE`, 
                   pctCorrect=`TRUE`/n, 
                   pctNaive=1/(nrow(.)), 
                   lift=pctCorrect/pctNaive-1
                   )
    }
    
    # Helper: overall accuracy from the output of confBySubgroup()
    # nBkt: number of distinct predictor values in the data being assessed
    confOverall <- function(dfSub, nBkt) {
        dfSub %>%
            summarize(nMax=max(n), across(all_of(c("FALSE", "TRUE", "n")), sum)) %>%
            mutate(pctCorrect=`TRUE`/n, 
                   pctNaive=nMax/n, 
                   lift=pctCorrect/pctNaive-1, 
                   nBucket=nBkt
                   )
    }
    
    # Counts of predictor to target variable
    # Within each predictor value, the most frequent target (first row after sorting) is the prediction
    dfPred <- df %>%
        group_by(across(all_of(c(prd, tgt)))) %>%
        summarize(n=n(), .groups="drop") %>%
        arrange(across(all_of(prd)), desc(n)) %>%
        group_by(across(all_of(prd))) %>%
        mutate(correct=row_number()==1, predicted=first(get(tgt))) %>%
        ungroup()

    # Confusion matrix and accuracy
    dfConf <- confBySubgroup(dfPred)
    
    # Overall confusion matrix
    dfConfAll <- confOverall(dfConf, nBkt=length(unique(dfPred[[prd]])))
    
    # Print confusion matrices
    if(nPrint > 0) {
        cat("\nAccuracy by target subgroup (training data):\n")
        dfConf %>% print(n=nPrint)
        cat("\nOverall Accuracy (training data):\n")
        dfConfAll %>% print(n=nPrint)
    }
    
    # Plot of overlaps
    if(isTRUE(showPlot)) {
        p1 <- dfPred %>%
            group_by(across(c(all_of(tgt), "predicted", "correct"))) %>%
            summarize(n=sum(n), .groups="drop") %>%
            ggplot(aes(x=get(tgt), y=predicted)) + 
            labs(x="Actual", 
                 y="Predicted", 
                 title=paste0("Training data - Actual vs. predicted ", tgt), 
                 subtitle=paste0("(using ", prd, ")")
                 ) + 
            geom_text(aes(label=n)) + 
            geom_tile(aes(fill=correct), alpha=0.25)
        print(p1)
    }
    
    # Create metrics for test dataset if requested
    if(!is.null(dfTest)) {
        # Get the most frequently predicted category from training data
        # sort=TRUE is needed so that the first row is the largest count rather than the first category in sort order
        mostPredicted <- count(dfPred, predicted, wt=n, sort=TRUE) %>% slice(1) %>% pull(predicted)
        # Get mapping of predictor value to prediction
        dfPredict <- dfPred %>% 
            distinct(across(all_of(c(prd, "predicted"))))
        # Create predictions for test data (predictor values not seen in training get the default prediction)
        dfPredTest <- dfTest %>%
            select(all_of(c(prd, tgt))) %>%
            left_join(dfPredict, by=prd) %>%
            replace_na(list(predicted=mostPredicted)) %>%
            group_by(across(all_of(c(prd, tgt, "predicted")))) %>%
            summarize(n=n(), .groups="drop") %>%
            mutate(correct=(get(tgt)==predicted))
        # Create confusion statistics for test data
        dfConfTest <- confBySubgroup(dfPredTest)
        # Overall confusion matrix for test data
        # The bucket count comes from dfPredTest, since dfConfTest no longer has the predictor column
        dfConfAllTest <- confOverall(dfConfTest, nBkt=length(unique(dfPredTest[[prd]])))
        # Print confusion matrices
        if(nPrint > 0) {
            cat("\nAccuracy by target subgroup (testing data):\n")
            dfConfTest %>% print(n=nPrint)
            cat("\nOverall Accuracy (testing data):\n")
            dfConfAllTest %>% print(n=nPrint)
            }
    } else {
        dfPredTest <- NULL
        dfConfTest <- NULL
        dfConfAllTest <- NULL
        
    }
    
    # Return data if requested
    if(isTRUE(returnData)) list(dfPred=dfPred, 
                                dfConf=dfConf, 
                                dfConfAll=dfConfAll, 
                                dfPredTest=dfPredTest, 
                                dfConfTest=dfConfTest, 
                                dfConfAllTest=dfConfAllTest
                                )
    
}


# Count and rank the target values observed for each value of a single predictor
simpleOneVarFit <- function(df, 
                            tgt, 
                            prd, 
                            rankType="last", 
                            naMethod=TRUE
                            ) {

    # FUNCTION ARGUMENTS:
    # df: data frame or tibble with key elements (training data set)
    # tgt: target variable
    # prd: predictor variable
    # rankType: tie-breaking method for equal n, passed to base::rank as ties.method=
    # naMethod: treatment of NA in ranks, passed to base::rank as na.last=

    # Number of observations for each predictor-target combination
    dfCounts <- df %>%
        group_by(across(all_of(c(prd, tgt)))) %>%
        summarize(n=n(), .groups="drop")

    # Order by predictor, then largest count first, then target
    dfOrdered <- arrange(dfCounts, across(all_of(prd)), desc(n), across(all_of(tgt)))

    # Within each predictor value, rankN is 1 for the largest count
    dfRanked <- dfOrdered %>%
        group_by(across(all_of(prd))) %>%
        mutate(rankN=n()+1-rank(n, ties.method=rankType, na.last=naMethod))

    # Sort by rank within predictor and return without grouping
    dfRanked %>%
        arrange(across(all_of(prd)), rankN) %>%
        ungroup()

}


# Build the predictor-to-prediction mapping for a categorical target
simpleOneVarMapper <- function(df, tgt, prd) {

    # FUNCTION ARGUMENTS:
    # df: data frame or tibble from simpleOneVarFit()
    # tgt: target variable
    # prd: predictor variable

    # Prediction for each predictor value is the target in the first row for that predictor
    dfByPredictor <- group_by(df, across(all_of(prd)))
    dfFirstRows <- filter(dfByPredictor, row_number()==1)
    dfPredictor <- ungroup(select(dfFirstRows, all_of(c(prd, tgt))))

    # Actual target values weighted by n, most common first
    dfCommon <- count(df, across(all_of(tgt)), wt=n, sort=TRUE)

    list(dfPredictor=dfPredictor, dfCommon=dfCommon)

}


# Map the categorical predictions to unseen data
simpleOneVarApplyMapper <- function(df, 
                                    tgt,
                                    prd, 
                                    mapper, 
                                    mapperDF="dfPredictor", 
                                    mapperDefault="dfCommon",
                                    prdName="predicted"
                                    ) {
    
    # FUNCTION ARGUMENTS:
    # df: data frame containing prd for predicting tgt
    # tgt: target variable in df
    # prd: predictor variable in df
    # mapper: mapping list from simpleOneVarMapper()
    # mapperDF: element that can be used to merge mappings
    # mapperDefault: element that can be used for NA resulting from merging mapperDF
    # prdName: name for the prediction variable
    
    # Extract the mapper, with the target column renamed to prdName
    vecRename <- c(prdName) %>% purrr::set_names(tgt)
    dfMap <- mapper[[mapperDF]] %>% select(all_of(c(prd, tgt))) %>% colRenamer(vecRename=vecRename)
    
    # Default prediction is the target value in the first row of the mapperDefault element
    # [[ ]] is used so that tgt is always treated as a column name
    chrDefault <- slice(mapper[[mapperDefault]], 1)[[tgt]]
    
    # The replacement list is keyed by prdName so that the default is applied for any prediction column name
    lstDefault <- stats::setNames(list(chrDefault), prdName)
    
    # Merge mappings to df and fill predictor values that have no mapping
    df %>%
        left_join(dfMap, by=prd) %>%
        replace_na(lstDefault)
    
}


# Summarize actual vs. predicted counts for categorical predictions
simpleOneVarConfusionData <- function(df, 
                                      tgtOrig,
                                      tgtPred, 
                                      otherVars=c(),
                                      weightBy="n"
                                      ) {

    # FUNCTION ARGUMENTS:
    # df: data frame from simpleOneVarApplyMapper()
    # tgtOrig: name of the original target variable in df
    # tgtPred: name of the predicted target variable in df
    # otherVars: additional variables to keep (used as grouping variables)
    # weightBy: variable in df holding the count for each row (NULL means each row counts as 1)

    # Group by actual, predicted, and any additional variables
    grpVars <- c(tgtOrig, tgtPred, otherVars)
    dfGrouped <- group_by(df, across(all_of(grpVars)))

    # Total count per group, either as number of rows or as sum of the weighting variable
    dfCounts <- if(is.null(weightBy)) {
        summarize(dfGrouped, n=n(), .groups="drop")
    } else {
        summarize(dfGrouped, n=sum(get(weightBy)), .groups="drop")
    }

    # Flag the groups where the prediction matches the actual
    mutate(dfCounts, correct=get(tgtOrig)==get(tgtPred))

}


# Report (print, plot, and optionally return) confusion data for categorical predictions
simpleOneVarConfusionReport <- function(df, 
                                        tgtOrig,
                                        tgtPred, 
                                        otherVars=c(), 
                                        printConf=TRUE,
                                        printConfOrig=printConf, 
                                        printConfPred=printConf,
                                        printConfOverall=printConf, 
                                        plotConf=TRUE, 
                                        plotDesc="",
                                        nBucket=NA, 
                                        predictorVarName="", 
                                        returnData=FALSE
                                        ) {

    # FUNCTION ARGUMENTS:
    # df: data frame from simpleOneVarConfusionData() (columns n and correct are used)
    # tgtOrig: name of the original target variable in df
    # tgtPred: name of the predicted target variable in df
    # otherVars: other variables to be kept - NOT IMPLEMENTED
    # printConf: boolean, default for the three print arguments that follow
    # printConfOrig: boolean, print confusion data by original target variable?
    # printConfPred: boolean, print confusion data by predicted target variable?
    # printConfOverall: boolean, print overall confusion data?
    # plotConf: boolean, plot the overlap of actual and predicted?
    # plotDesc: descriptive label placed in front of the plot title
    # nBucket: number of buckets used for prediction (stored in the overall confusion data)
    # predictorVarName: variable name shown in the plot subtitle
    # returnData: boolean, return the three confusion tables as a list?

    wantData <- isTRUE(returnData)

    # Shared summary of right, wrong, and total counts for a (possibly grouped) data frame
    tallyRightWrong <- function(d) {
        summarize(d, right=sum(n*correct), wrong=sum(n)-right, n=sum(n), .groups="drop")
    }

    # Confusion data by original target variable
    if(isTRUE(printConfOrig) || wantData) {
        dfConfOrig <- df %>%
            group_by(across(all_of(c(tgtOrig)))) %>%
            tallyRightWrong() %>%
            mutate(pctRight=right/n, pctNaive=n/(sum(n)), lift=pctRight/pctNaive-1)
        if(isTRUE(printConfOrig)) {
            cat("\nConfusion data based on original target variable:", tgtOrig, "\n")
            print(dfConfOrig, n=50)
        }
    }

    # Confusion data by predicted target variable
    if(isTRUE(printConfPred) || wantData) {
        dfConfPred <- df %>%
            group_by(across(all_of(c(tgtPred)))) %>%
            tallyRightWrong() %>%
            mutate(pctRight=right/n)
        if(isTRUE(printConfPred)) {
            cat("\nConfusion data based on predicted target variable:", tgtPred, "\n")
            print(dfConfPred, n=50)
        }
    }

    # Overall confusion data, with the naive benchmark being the largest original target class
    if(isTRUE(printConfOverall) || wantData) {
        dfClassTotals <- df %>%
            group_by(across(all_of(tgtOrig))) %>%
            summarize(n=sum(n), .groups="drop")
        maxNaive <- dfClassTotals %>%
            arrange(desc(n)) %>%
            slice(1) %>%
            pull(n)
        dfConfOverall <- tallyRightWrong(df) %>%
            mutate(maxN=maxNaive, pctRight=right/n, pctNaive=maxN/n, lift=pctRight/pctNaive-1, nBucket=nBucket)
        if(isTRUE(printConfOverall)) {
            cat("\nOverall confusion matrix\n")
            print(dfConfOverall, n=50)
        }
    }

    # Plot of overlaps between actual and predicted
    if(isTRUE(plotConf)) {
        dfPlot <- df %>%
            group_by(across(all_of(c(tgtOrig, tgtPred, "correct")))) %>%
            summarize(n=sum(n), .groups="drop")
        p1 <- ggplot(dfPlot, aes(x=get(tgtOrig), y=get(tgtPred))) + 
            labs(x="Actual", 
                 y="Predicted", 
                 title=paste0(plotDesc, "Actual vs. predicted ", tgtOrig), 
                 subtitle=paste0("(using ", predictorVarName, ")")
                 ) + 
            geom_text(aes(label=n)) + 
            geom_tile(aes(fill=correct), alpha=0.25)
        print(p1)
    }

    # Return data if requested
    if(wantData) list(dfConfOrig=dfConfOrig, dfConfPred=dfConfPred, dfConfOverall=dfConfOverall)

}


# Chain the fit, mapper, applier, and confusion steps for a categorical target
simpleOneVarChain <- function(df,
                              tgt,
                              prd,
                              mapper=NULL, 
                              rankType="last", 
                              naMethod=TRUE, 
                              printReport=TRUE, 
                              plotDesc="",
                              returnData=TRUE, 
                              includeConfData=FALSE
                              ) {

    # FUNCTION ARGUMENTS:
    # df: data frame or tibble with key elements (training or testing data set)
    # tgt: target variable
    # prd: predictor variable
    # mapper: mapping list to be applied for predictions (NULL means create from simpleOneVarMapper())
    # rankType: tie-breaking method for equal n, passed to base::rank as ties.method=
    # naMethod: treatment of NA in ranks, passed to base::rank as na.last=
    # printReport: boolean, should the confusion report data and plot be printed?
    # plotDesc: descriptive label placed in front of the plot title
    # returnData: boolean, should data elements be returned?
    # includeConfData: boolean, should the confusion report tables be included in the returned data?

    # Summary of predictor-target-n
    dfFit <- simpleOneVarFit(df, tgt=tgt, prd=prd, rankType=rankType, naMethod=naMethod)

    # Use the passed mapper if there is one, otherwise build it from the fit
    if(is.null(mapper)) mapper <- simpleOneVarMapper(dfFit, tgt=tgt, prd=prd)

    # Predictions, followed by confusion data
    dfApplied <- simpleOneVarApplyMapper(dfFit, tgt=tgt, prd=prd, mapper=mapper)
    dfConfusion <- simpleOneVarConfusionData(dfApplied, tgtOrig=tgt, tgtPred="predicted")

    # The report runs if it will be printed or if its tables will be returned
    needReport <- isTRUE(printReport) || isTRUE(includeConfData)
    if(needReport) {
        dfConfReport <- simpleOneVarConfusionReport(df=dfConfusion, 
                                                    tgtOrig=tgt, 
                                                    tgtPred="predicted", 
                                                    nBucket=length(unique(dfApplied[[prd]])), 
                                                    predictorVarName=prd, 
                                                    printConf=printReport, 
                                                    plotConf=printReport,
                                                    plotDesc=plotDesc,
                                                    returnData=includeConfData
                                                    )
    }

    # Nothing is returned unless requested
    if(!isTRUE(returnData)) return(invisible(NULL))

    # Assemble and return the data elements
    ret <- list(dfFit=dfFit, mapper=mapper, dfApplied=dfApplied, dfConfusion=dfConfusion)
    if(isTRUE(includeConfData)) ret <- append(ret, list(dfConfData=dfConfReport))
    ret

}


# Run the single variable prediction chain on training data, then apply its mapper to testing data
simpleOneVarTrainTest <- function(dfTrain,
                                  dfTest,
                                  tgt,
                                  prd,
                                  rankType="last", 
                                  naMethod=TRUE, 
                                  printReport=FALSE, 
                                  includeConfData=TRUE, 
                                  returnData=TRUE
                              ) {

    # FUNCTION ARGUMENTS:
    # dfTrain: data frame or tibble with key elements (training data set)
    # dfTest: data frame or tibble with key elements (testing data set)
    # tgt: target variable
    # prd: predictor variable
    # rankType: tie-breaking method for equal n, passed to base::rank as ties.method=
    # naMethod: treatment of NA in ranks, passed to base::rank as na.last=
    # printReport: boolean, should the confusion report data and plot be printed?
    # includeConfData: boolean, should confusion data be returned?
    # returnData: boolean, should data elements be returned?

    # Shared call to simpleOneVarChain(), varying only the data, plot label, and mapper
    runChain <- function(dfUse, desc, useMapper=NULL) {
        simpleOneVarChain(df=dfUse, 
                          tgt=tgt, 
                          prd=prd,
                          mapper=useMapper,
                          rankType=rankType,
                          naMethod=naMethod,
                          printReport=printReport,
                          plotDesc=desc,
                          returnData=TRUE,
                          includeConfData=includeConfData
                          )
    }

    # Training data creates its own mapper
    tmpTrain <- runChain(dfTrain, desc="Training data: ")

    # Testing data is scored using the mapper from the training data
    tmpTest <- runChain(dfTest, desc="Testing data: ", useMapper=tmpTrain$mapper)

    # Return data if requested
    if(isTRUE(returnData)) list(tmpTrain=tmpTrain, tmpTest=tmpTest)

}


# Plot the cluster centers of a k-means object, by variable and cluster
plotClusterMeans <- function(km, nrow=NULL, ncol=NULL, scales="fixed") {

    # FUNCTION ARGUMENTS
    # km: object returned by stats::kmeans(...)
    # nrow: number of rows for faceting (NULL means default)
    # ncol: number of columns for faceting (NULL means default)
    # scales: passed to facet_wrap as scales=scales

    # One row per cluster and variable
    dfCenters <- km$centers %>%
        tibble::as_tibble() %>%
        mutate(cluster=row_number()) %>%
        pivot_longer(cols=-c(cluster))

    # Variables are ordered by signed difference (second minus first) for two values, otherwise by range
    spreadOfMeans <- function(a) {
        if(length(a)==2) a[2]-a[1] else diff(range(a))
    }

    # Assess clustering by dimension, with a dashed reference line at the median of all centers
    p1 <- ggplot(dfCenters, aes(x=fct_reorder(name, value, .fun=spreadOfMeans), y=value)) + 
        geom_point(aes(color=factor(cluster))) + 
        scale_color_discrete("Cluster") + 
        facet_wrap(~factor(cluster), nrow=nrow, ncol=ncol, scales=scales) +
        labs(title=paste0("Cluster means (kmeans, centers=", nrow(km$centers), ")"), 
             x="Metric", 
             y="Cluster mean"
             ) + 
        geom_hline(yintercept=median(km$centers), lty=2) +
        coord_flip()
    print(p1)

}


# Plot the percentage of observations falling in each cluster, by one or two key variables
plotClusterPct <- function(df, km, keyVars, nRowFacet=1, printPlot=TRUE) {

    # FUNCTION ARGUMENTS:
    # df: data frame initially passed to stats::kmeans(...)
    # km: object returned by stats::kmeans(...)
    # keyVars: character vector of length 1 (y-only, x will be cl) or length 2 (x, y, cl will facet)
    # nRowFacet: number of rows for facetting (only relevant if length(keyVars) is 2)
    # printPlot: boolean, should plot be printed? (if not true, plot will be returned)

    # Check length of keyVars
    nKey <- length(keyVars)
    if(!(nKey %in% c(1, 2))) stop("\nArgument keyVars must be length-1 or length-2\n")

    # Share of each key variable combination that falls in each cluster
    dfPct <- df %>%
        mutate(cl=factor(km$cluster)) %>%
        group_by(across(c(all_of(keyVars), "cl"))) %>%
        summarize(n=n(), .groups="drop") %>%
        group_by(across(all_of(keyVars))) %>%
        mutate(pct=n/sum(n)) %>%
        ungroup()

    # Axis labels depend on whether cluster is the x-axis (one key variable) or the facet (two key variables)
    xLabel <- if(nKey==1) "Cluster" else keyVars[[1]]
    yLabel <- if(nKey==1) keyVars[[1]] else keyVars[[2]]

    p1 <- ggplot(dfPct) + 
        scale_fill_continuous(low="white", high="green") + 
        labs(title=paste0("Percentage by cluster (kmeans with ", nrow(km$centers), " centers)"), 
             x=xLabel, 
             y=yLabel
             )

    # Tiles are cluster by key variable, or key variable by key variable faceted by cluster
    if(nKey==1) {
        p1 <- p1 + geom_tile(aes(fill=pct, x=cl, y=get(keyVars[1])))
    } else {
        p1 <- p1 + 
            geom_tile(aes(fill=pct, x=get(keyVars[1]), y=get(keyVars[2]))) + 
            facet_wrap(~cl, nrow=nRowFacet)
    }

    # Return the plot without printing unless printing is requested
    if(!isTRUE(printPlot)) return(p1)
    print(p1)

}


# Run k-means (or use passed k-means object) and plot centers and percentages of observations
runKMeans <- function(df, 
                      km=NULL,
                      vars=NULL, 
                      centers=2, 
                      nStart=1L, 
                      iter.max=10L, 
                      seed=NULL, 
                      plotMeans=FALSE,
                      nrowMeans=NULL,
                      plotPct=NULL, 
                      nrowPct=1, 
                      returnKM=is.null(km)
                      ) {
    
    # FUNCTION ARGUMENTS:
    # df: data frame for clustering
    # km: k-means object (will shut off k-means processing and run as plot-only)
    # vars: variables to be used for clustering (NULL means everything in df)
    # centers: number of centers
    # nStart: passed to kmeans
    # iter.max: passed to kmeans
    # seed: seed to be set (if NULL, no seed is set)
    # plotMeans: boolean, plot variable means by cluster?
    # nrowMeans: argument passed as nrow for faceting rows in plotClusterMeans() - NULL is default ggplot2
    # plotPct: list of character vectors to be passed sequentially as keyVars to plotClusterPct()
    #          NULL (or an empty list) means do not run
    #          plotPct=list(c("var1"), c("var2", "var3")) will run plotting twice
    # nrowPct: argument for faceting number of rows in plotClusterPct()
    # returnKM: boolean, should the k-means object be returned?
    #           (default is TRUE when k-means is run here, FALSE when km is passed for plot-only)
    
    # Resolve returnKM before km is overwritten below
    # Its default is is.null(km), and default arguments are only evaluated on first use
    force(returnKM)
    
    # Set seed if requested
    if(!is.null(seed)) set.seed(seed)
    
    # Get the variable names if passed as NULL
    if(is.null(vars)) vars <- names(df)
    
    # Run the k-means process if the object has not been passed
    if(is.null(km)) {
        km <- stats::kmeans(df[, unique(vars), drop=FALSE], 
                            centers=centers, 
                            iter.max=iter.max, 
                            nstart=nStart
                            )
    }

    # Assess clustering by dimension if requested
    if(isTRUE(plotMeans)) plotClusterMeans(km, nrow=nrowMeans)
    
    # Plot percentages by cluster, once per element of plotPct (nothing happens for NULL or empty)
    for(keyVars in plotPct) plotClusterPct(df=df, km=km, keyVars=keyVars, nRowFacet=nrowPct)
    
    # Return the k-means object
    if(isTRUE(returnKM)) return(km)
    
}


# Assign points to closest center of a passed k-means object
assignKMeans <- function(km, df, returnAllDistanceData=FALSE) {
    
    # FUNCTION ARGUMENTS:
    # km: a k-means object
    # df: data frame or tibble
    # returnAllDistanceData: boolean, should the distance data and clusters be returned?
    #                        TRUE returns a tibble with distances as V1, V2, ..., and cluster as cl
    #                        FALSE returns a vector of cluster assignments as integers
    
    # Select columns from df to match km (same columns, same order as the centers)
    centers <- km$centers
    keyVars <- colnames(centers)
    if(is.null(keyVars) || !all(keyVars %in% colnames(df))) 
        stop("\nName mismatch in clustering and frame\n")
    xMat <- as.matrix(df[, keyVars, drop=FALSE])
    
    # Euclidean distance from every row of df to every center
    # The matrix is filled column by column so that it keeps one row per observation 
    # even when df has a single row
    distMat <- matrix(NA_real_, nrow=nrow(xMat), ncol=nrow(centers))
    for(idx in seq_len(nrow(centers))) {
        distMat[, idx] <- sqrt(rowSums(sweep(xMat, 2, centers[idx, ])^2))
    }
    
    # Closest center for each observation
    cl <- apply(distMat, 1, which.min)
    
    # Return the proper file
    if(!isTRUE(returnAllDistanceData)) return(cl)
    distClust <- tibble::as_tibble(as.data.frame(distMat))
    distClust$cl <- cl
    distClust
    
}

As well, specific functions from _v002 and _v003 are copied:

runSimpleRF <- function(df, yVar, xVars=NULL, ...) {

    # Fit a random forest of yVar on xVars using ranger::ranger()
    #
    # FUNCTION ARGUMENTS:
    # df: data frame containing observations
    # yVar: variable to be predicted (numeric for regression, categorical for classification)
    # xVars: predictor variables (NULL means everything in df except for yVar)
    # ...: other arguments passed to ranger::ranger
    
    # Default predictors are all columns of df other than the dependent variable
    xVars <- if(is.null(xVars)) setdiff(names(df), yVar) else xVars
    
    # The formula is pasted together as "yVar~x1+x2+..." and only the modeled columns are passed
    # The fitted ranger object is the return value
    ranger::ranger(as.formula(paste0(yVar, "~", paste0(xVars, collapse="+"))), 
                   data=df[, c(yVar, xVars)], 
                   ...
                   )
    
}

plotRFImportance <- function(rf, 
                             impName="variable.importance", 
                             divBy=1000, 
                             plotTitle=NULL, 
                             plotData=TRUE, 
                             returnData=!isTRUE(plotData)
                             ) {
    
    # Extract variable importance from a fitted model, optionally plotting and/or returning it
    #
    # FUNCTION ARGUMENTS:
    # rf: output list from random forest with an element for importance
    # impName: name of the element to extract from rf
    # divBy: divisor for the importance variable
    # plotTitle: title for plot (NULL means use default)
    # plotData: boolean, should the importance plot be created and printed?
    # returnData: boolean, should the processed data be returned?
    
    # Default title
    if(is.null(plotTitle)) plotTitle <- "Importance for simple random forest"

    # Y-axis label notes the divisor unless the importance is plotted undivided
    yAxisLabel <- "Variable Importance"
    isUnscaled <- isTRUE(all.equal(divBy, 1))
    if(!isUnscaled) yAxisLabel <- paste0(yAxisLabel, " (", divBy, "s)")
    
    # Importance as a tibble with columns metric (taken from the element names) and imp
    dfImportance <- as.data.frame(rf[[impName]])
    dfImportance <- purrr::set_names(dfImportance, "imp")
    dfImportance <- rownames_to_column(dfImportance, "metric")
    dfImportance <- tibble::as_tibble(dfImportance)
    
    # Horizontal bar chart, ordered by importance, if requested
    if(isTRUE(plotData)) {
        p1 <- ggplot(dfImportance, aes(x=fct_reorder(metric, imp), y=imp/divBy)) + 
            geom_col(fill="lightblue") + 
            labs(x=NULL, y=yAxisLabel, title=plotTitle) +
            coord_flip()
        print(p1)
    }
    
    # Return data if requested
    if(isTRUE(returnData)) return(dfImportance)
    
}

predictRF <- function(rf, df, newCol="pred", predsOnly=FALSE) {
    
    # Apply a trained model to a data frame and return the predictions
    #
    # FUNCTION ARGUMENTS:
    # rf: a trained random forest model
    # df: data frame for adding predictions
    # newCol: name for new column to be added to df
    # predsOnly: boolean, should only the vector of predictions be returned?
    #            if FALSE, a column named newCol is added to df, with df returned

    # The predictions element of the predict() output holds the predicted values
    preds <- predict(rf, data=df)$predictions
    
    # Default: attach the predictions to df under the requested column name
    if(!isTRUE(predsOnly)) {
        df[newCol] <- preds
        return(df)
    }
    
    # Otherwise return the bare predictions
    return(preds)
    
}

# Update for continuous variables
reportAccuracy <- function(df, 
                           trueCol, 
                           predCol="pred", 
                           reportAcc=TRUE, 
                           rndReport=2, 
                           useLabel="requested data",
                           returnAcc=!isTRUE(reportAcc), 
                           reportR2=FALSE
                           ) {
    
    # FUNCTION ARGUMENTS:
    # df: data frame containing actual and predictions
    # trueCol: column containing true value
    # predCol: column containing predicted value
    # reportAcc: boolean, should accuracy be reported (printed to output)?
    # rndReport: number of decimal places for reporting (applied after converting to percentage)
    # useLabel: label for data to be used in reporting
    # returnAcc: boolean, should the accuracy be returned 
    #            return value is not converted to percentage, not rounded
    #            categorical: single proportion of matching rows
    #            continuous: named vector with elements mseNull, msePred, r2
    # reportR2: boolean, should accuracy be calculated as R-squared?
    #           (default FALSE measures as categorical)
    
    # Fail clearly if either column is absent (extraction with [[ would otherwise give NULL)
    missingCols <- setdiff(c(trueCol, predCol), names(df))
    if(length(missingCols) > 0) 
        stop("\nColumn(s) not found in df: ", paste0(missingCols, collapse=", "), "\n")
    
    # Actual and predicted values as plain vectors
    tc <- df[[trueCol]]
    pc <- df[[predCol]]
    
    # Continuous or categorical reporting
    if(isTRUE(reportR2)) {
        # Null model predicts the mean of the actual values for every row
        mseNull <- mean((tc-mean(tc))^2)
        msePred <- mean((tc-pc)^2)
        r2 <- 1 - msePred/mseNull
        if(isTRUE(reportAcc)) 
            cat("\nR-squared of ", 
                useLabel, 
                " is: ", 
                round(100*r2, rndReport), 
                "% (RMSE ",
                round(sqrt(msePred), 2), 
                " vs. ", 
                round(sqrt(mseNull), 2),
                " null)\n", 
                sep=""
                )
        acc <- c("mseNull"=mseNull, "msePred"=msePred, "r2"=r2)
    } else {
        # Factors are compared by label, since == on two factors errors when level sets differ
        if(is.factor(tc) || is.factor(pc)) {
            tc <- as.character(tc)
            pc <- as.character(pc)
        }
        acc <- mean(tc==pc)
        if(isTRUE(reportAcc)) 
            cat("\nAccuracy of ", useLabel, " is: ", round(100*acc, rndReport), "%\n", sep="")    
    }
    
    # Return accuracy statistic if requested
    if(isTRUE(returnAcc)) return(acc)
    
}

# Update for automated rounding
plotConfusion <- function(df, 
                          trueCol, 
                          predCol="pred", 
                          useTitle=NULL,
                          useSub=NULL, 
                          plotCont=FALSE, 
                          rndTo=NULL,
                          rndBucketsAuto=100,
                          nSig=NULL,
                          refXY=FALSE
                          ) {
    
    # FUNCTION ARGUMENTS:
    # df: data frame containing actual and predictions
    # trueCol: column containing true value
    # predCol: column containing predicted value
    # useTitle: title to be used for chart (NULL means create from trueCol)
    # useSub: subtitle to be used for chart (NULL means none)
    # plotCont: boolean, should plotting assume continuous variables?
    #           (default FALSE assumes confusion plot for categorical variables)
    # rndTo: every number in x should be rounded to the nearest rndTo
    #        NULL means no rounding (default)
    #        -1L means make an estimate based on data
    # rndBucketsAuto: integer, if rndTo is -1L, about how many buckets are desired for predictions?
    # nSig: number of significant digits for automatically calculated rounding parameter
    #       (NULL means calculate exactly)
    # refXY: boolean, should a reference line for y=x be included? (relevant only for continuous)
    
    # Create title if not supplied
    if(is.null(useTitle)) useTitle <- paste0("Predicting ", trueCol)

    # Function auto-round returns vector as-is when rndTo is NULL and auto-rounds when rndTo is -1L
    df <- df %>%
        mutate(across(all_of(c(trueCol, predCol)), 
                      .fns=function(x) autoRound(x, rndTo=rndTo, rndBucketsAuto=rndBucketsAuto, nSig=nSig)
                      )
               )
    
    # Number of observations for each (actual, predicted) combination
    dfCounts <- df %>%
        group_by(across(all_of(c(trueCol, predCol)))) %>%
        summarize(n=n(), .groups="drop")
    
    # Create base plot (applicable to categorical or continuous variables)
    # Use x as true and y as predicted, for more meaningful geom_smooth() if continuous
    # Columns are referenced by name through the .data pronoun rather than get()
    p1 <- ggplot(dfCounts, aes(y=.data[[predCol]], x=.data[[trueCol]])) + 
        labs(y="Predicted", x="Actual", title=useTitle, subtitle=useSub)
        
    # Update plot as appropriate
    if(isTRUE(plotCont)) {
        # Continuous: points sized by count, with a count-weighted linear fit
        p1 <- p1 +
            geom_point(aes(size=n), alpha=0.5) + 
            scale_size_continuous("# Obs") +
            geom_smooth(aes(weight=n), method="lm")
        if(isTRUE(refXY)) p1 <- p1 + geom_abline(slope=1, intercept=0, lty=2, color="red")
    } else {
        # Categorical: tiles labelled with counts, with coordinates flipped
        p1 <- p1 + 
            geom_tile(aes(fill=n)) + 
            geom_text(aes(label=n), size=2.5) +
            coord_flip() +
            scale_fill_continuous("", low="white", high="green")
    }
    
    # Output plot
    print(p1)
    
}

runFullRF <- function(dfTrain, 
                      yVar, 
                      xVars, 
                      dfTest=dfTrain,
                      useLabel="test data",
                      useSub=NULL, 
                      isContVar=FALSE,
                      rndTo=NULL,
                      rndBucketsAuto=100,
                      nSig=NULL,
                      refXY=FALSE,
                      makePlots=TRUE,
                      plotImp=makePlots,
                      plotConf=makePlots,
                      returnData=FALSE, 
                      ...
                      ) {
    
    # Train a random forest, score it on test data, and optionally plot and return the results
    #
    # FUNCTION ARGUMENTS:
    # dfTrain: training data
    # yVar: dependent variable
    # xVars: column(s) containing independent variables
    # dfTest: test dataset for applying predictions
    # useLabel: label to be used for reporting accuracy
    # useSub: subtitle to be used for confusion chart (NULL means none)
    # isContVar: boolean, is the variable continuous? (default FALSE means categorical)
    # rndTo: every number in x should be rounded to the nearest rndTo
    #        NULL means no rounding (default)
    #        -1L means make an estimate based on data
    # rndBucketsAuto: integer, if rndTo is -1L, about how many buckets are desired for predictions?
    # nSig: number of significant digits for automatically calculated rounding parameter
    #       (NULL means calculate exactly)    
    # refXY: boolean, should a reference line for y=x be included? (relevant only for continuous)
    # makePlots: boolean, should plots be created for variable importance and confusion matrix?
    # plotImp: boolean, should variable importance be plotted? (default is makePlots)
    # plotConf: boolean, should confusion matrix be plotted? (default is makePlots)
    # returnData: boolean, should data be returned?
    #             if TRUE, a list with elements rf, rfImp, tstPred, rfAcc is returned
    # ...: additional parameters to pass to runSimpleRF(), which are then passed to ranger::ranger()

    # Step 1: fit the forest, always with impurity-based importance
    rf <- runSimpleRF(df=dfTrain, yVar=yVar, xVars=xVars, importance="impurity", ...)

    # Step 2: importance table is always built; the plot is printed only when plotImp is TRUE
    rfImp <- plotRFImportance(rf, plotData=plotImp, returnData=TRUE)

    # Step 3: test data with predictions appended in column "pred"
    tstPred <- predictRF(rf=rf, df=dfTest)

    # Step 4: accuracy is printed and kept (R-squared if continuous, share correct if categorical)
    contFlag <- isTRUE(isContVar)
    rfAcc <- reportAccuracy(tstPred, 
                            trueCol=yVar, 
                            rndReport=3, 
                            useLabel=useLabel, 
                            reportR2=contFlag,
                            returnAcc=TRUE
                            )

    # Step 5: actual vs. predicted plot (continuous or categorical form) if requested
    if(isTRUE(plotConf)) {
        plotConfusion(tstPred, 
                      trueCol=yVar, 
                      useSub=useSub, 
                      plotCont=contFlag, 
                      rndTo=rndTo, 
                      rndBucketsAuto=rndBucketsAuto,
                      nSig=nSig,
                      refXY=refXY
                      )
    }
    
    # Step 6: nothing is returned unless requested
    if(!isTRUE(returnData)) return(invisible(NULL))
    list(rf=rf, rfImp=rfImp, tstPred=tstPred, rfAcc=rfAcc)
    
}

runPartialImportanceRF <- function(dfTrain, 
                                   yVar, 
                                   dfTest,
                                   impDB=dfImp,
                                   nImp=+Inf,
                                   otherX=c(),
                                   isContVar=TRUE, 
                                   useLabel=keyLabel, 
                                   useSub=stringr::str_to_sentence(keyLabel), 
                                   rndTo=NULL,
                                   rndBucketsAuto=50,
                                   nSig=NULL,
                                   refXY=FALSE,
                                   makePlots=FALSE, 
                                   returnElem=c("rfImp", "rfAcc")
                                   ) {
    
    # Run runFullRF() using only the predictors selected from an importance table
    #
    # FUNCTION ARGUMENTS
    # dfTrain: training data
    # yVar: y variable in dfTrain
    # dfTest: test data
    # impDB: tibble containing variable importance by dependent variable
    #        (columns n, src and metric are used; default is the object dfImp from the calling workspace)
    # nImp: use the top nImp variables by variable importance
    # otherX: include these additional x variables
    # isContVar: boolean, is this a continuous variable (regression)? FALSE means classification
    # useLabel: label for description (default is the object keyLabel from the calling workspace)
    # useSub: label for plot
    # rndTo: controls the rounding parameter for plots, passed to runFullRF 
    #        (NULL means no rounding)
    #        -1L means make an estimate based on underlying data
    # rndBucketsAuto: integer, if rndTo is -1L, about how many buckets are desired for predictions?
    # nSig: number of significant digits for automatically calculated rounding parameter
    #       (NULL means calculate exactly)    
    # refXY: controls the reference line parameter for plots, passed to runFullRF
    # makePlots: boolean, should plots be created?
    # returnElem: character vector of list elements to be returned

    # Predictors for yVar (rows with src equal to yVar) whose n is at most nImp
    # NOTE(review): n is presumably the importance rank within src - confirm against how impDB is built
    topVars <- impDB %>% 
        filter(n<=nImp, src==yVar) %>% 
        pull(metric)
    
    # Add the extra predictors, dropping any duplicates
    xUse <- unique(c(topVars, otherX))
    
    # Full results are always requested, then subset to the elements asked for
    rfOut <- runFullRF(dfTrain=dfTrain, 
                       yVar=yVar, 
                       xVars=xUse, 
                       dfTest=dfTest, 
                       isContVar = isContVar, 
                       useLabel=useLabel, 
                       useSub=useSub, 
                       rndTo=rndTo,
                       rndBucketsAuto=rndBucketsAuto,
                       nSig=nSig,
                       refXY=refXY,
                       makePlots=makePlots,
                       returnData=TRUE
                       )
    rfOut[returnElem]
    
}

autoRound <- function(x, rndTo=-1L, rndBucketsAuto=100, nSig=NULL) {

    # Round every element of x to the nearest multiple of rndTo
    #
    # FUNCTION ARGUMENTS
    # x: vector to be rounded
    # rndTo: every number in x should be rounded to the nearest rndTo
    #        NULL means no rounding
    #        -1L means make an estimate based on data (default)
    # rndBucketsAuto: integer, if rndTo is -1L, about how many buckets are desired for predictions?
    # nSig: number of significant digits for automatically calculated rounding parameter
    #       (NULL means calculate exactly)
    
    # If rndTo is passed as NULL, return x as-is
    if(is.null(rndTo)) return(x)
    
    # If rndTo is passed as -1L, make an estimate for rndTo
    if(isTRUE(all.equal(-1L, rndTo))) {
        # Get the number of unique values in x
        nUq <- length(unique(x))
        # If the number of unique values is no more than 150% of rndBucketsAuto, return as-is
        if(nUq <= 1.5*rndBucketsAuto) return(x)
        # Otherwise, calculate a value for rndTo from the range of the finite values
        # (missing or infinite elements would otherwise make rndTo, and so every result, NA or NaN)
        rndTo <- diff(range(x, finite=TRUE)) / rndBucketsAuto
        # Truncate to requested number of significant digits
        if(!is.null(nSig)) rndTo <- signif(rndTo, digits=nSig)
    }
    
    # Return the rounded vector if it was not already returned
    round(x/rndTo)*rndTo

}


autoPartialImportance <- function(dfTrain, 
                                  dfTest, 
                                  yVar, 
                                  isContVar,
                                  impDB=dfImp,
                                  impNums=c(1:10, 16, 25, nrow(filter(impDB, src==yVar)))
                                  ) {
    
    # Run runPartialImportanceRF() for several predictor counts and plot holdout performance
    #
    # FUNCTION ARGUMENTS:
    # dfTrain: training data
    # dfTest: test (holdout) data
    # yVar: dependent variable
    # isContVar: boolean, is this a continuous variable (R-2) or categorical variable (accuracy)?
    # impDB: tibble containing sorted variable importances by predictor
    #        (default is the object dfImp from the calling workspace)
    # impNums: vector of number of variables to run (each element in vector run)
    #          default is 1-10, 16, 25, and the number of rows of impDB for yVar
    #          (the count is taken from impDB, so it follows a custom importance table)
    
    # Holdout accuracy (or R-squared, if continuous) using the top nVars predictors
    getHoldoutAcc <- function(nVars) {
        acc <- runPartialImportanceRF(dfTrain=dfTrain, 
                                      yVar=yVar, 
                                      dfTest=dfTest, 
                                      isContVar=isContVar, 
                                      impDB=impDB, 
                                      nImp=nVars, 
                                      makePlots=FALSE
                                      )[["rfAcc"]]
        if(isTRUE(isContVar)) acc[["r2"]] else acc
    }
    
    # Accuracy on holdout data (exactly one number per element of impNums)
    tblRPI <- tibble::tibble(nImp=impNums, 
                             rfAcc=vapply(impNums, FUN=getHoldoutAcc, FUN.VALUE=numeric(1), USE.NAMES=FALSE)
                             )
    print(tblRPI)

    # Plot of holdout accuracy/r-squared vs. number of variables
    # A point at (0, 0) anchors the line, and the dashed line marks the best value obtained
    if(isTRUE(isContVar)) prtDesc <- "R-squared" else prtDesc <- "Accuracy"
    p1 <- tblRPI %>%
        select(nImp, rfAcc) %>%
        bind_rows(tibble::tibble(nImp=0, rfAcc=0)) %>%
        ggplot(aes(x=nImp, y=rfAcc)) + 
        geom_line() + 
        geom_point() + 
        labs(title=paste0(prtDesc, " on holdout data vs. number of predictors"), 
             subtitle=paste0("Predicting ", yVar),
             y=paste0(prtDesc, " on holdout data"), 
             x="# Predictors (selected in order of variable importance in full model)"
             ) + 
        lims(y=c(0, 1)) + 
        geom_hline(data=~filter(., rfAcc==max(rfAcc)), aes(yintercept=rfAcc), lty=2)
    print(p1)
    
    return(tblRPI)
    
}


runNextBestPredictor <- function(varsRun, 
                                 xFix, 
                                 yVar, 
                                 isContVar,
                                 dfTrain,
                                 dfTest=dfTrain, 
                                 useLabel="predictions based on training data applied to holdout dataset",
                                 useSub=stringr::str_to_sentence(keyLabel_v3), 
                                 makePlots=FALSE
                                 ) {
    
    # Score each candidate predictor when added to a fixed set of predictors
    #
    # FUNCTION ARGUMENTS:
    # varsRun: variables to be run as potential next-best predictors
    # xFix: variables that are already included in every test of next-best
    # yVar: dependent variable of interest
    # isContVar: boolean, is yvar continuous?
    # dfTrain: training data
    # dfTest: test data
    # useLabel: descriptive label
    # useSub: subtitle description (default uses the object keyLabel_v3 from the calling workspace)
    # makePlots: boolean, should plots be created for each predictor run?
    #
    # Prints the candidates sorted by descending score (up to 40 rows)
    # Returns a numeric vector of scores (R-squared if continuous, accuracy otherwise), 
    # named by candidate when varsRun is a character vector
    
    # Score of the model using xFix plus a single candidate
    scoreCandidate <- function(x) {
        y <- runFullRF(dfTrain=dfTrain, 
                       yVar=yVar, 
                       xVars=c(xFix, x),
                       dfTest=dfTest, 
                       useLabel=useLabel, 
                       useSub=useSub,
                       isContVar=isContVar,
                       makePlots=makePlots,
                       returnData=TRUE
                       )[["rfAcc"]]
        if(isTRUE(isContVar)) y[["r2"]] else y
    }
    
    # vapply always returns a numeric vector, including when varsRun is empty
    vecAcc <- vapply(varsRun, FUN=scoreCandidate, FUN.VALUE=numeric(1), USE.NAMES=TRUE)

    # Report candidates from best to worst
    vecAcc %>% 
        as.data.frame() %>% 
        purrr::set_names("rfAcc") %>% 
        rownames_to_column("pred") %>% 
        tibble::as_tibble() %>%
        arrange(desc(rfAcc)) %>%
        print(n=40)
    
    vecAcc

}


getNextBestVar <- function(x, returnTbl=FALSE, n=if(isTRUE(returnTbl)) +Inf else 1) {
    
    # Pick the top predictor(s) from a named vector of scores
    #
    # FUNCTION ARGUMENTS:
    # x: named vector of accuracy or r-squared
    # returnTbl: boolean, if TRUE convert to tibble and return, if FALSE return vector of top-n predictors 
    # n: number of predictors to return (+Inf will return the full tibble or vector)
    #    default is everything when returning a tibble, and the single best predictor otherwise
    
    # Names go to column pred; rows are sorted from highest to lowest value and cut to the first n
    tblRanked <- vecToTibble(x, colNameName="pred")
    tblRanked <- arrange(tblRanked, -value)
    tblRanked <- slice_head(tblRanked, n=n)
    
    # Names only, unless the tibble was requested
    if(!isTRUE(returnTbl)) return(pull(tblRanked, pred))
    return(tblRanked)
    
}


newCityPredict <- function(rf, 
                           dfTest, 
                           trueCol, 
                           isContVar=FALSE,
                           reportR2=isTRUE(isContVar), 
                           plotCont=isTRUE(isContVar), 
                           reportAcc=TRUE, 
                           rndReport=2, 
                           useLabel="requested data",
                           useTitle=NULL,
                           useSub=NULL, 
                           rndTo=NULL,
                           rndBucketsAuto=100,
                           nSig=NULL,
                           refXY=FALSE, 
                           returnData=TRUE
                           ) {
    
    # Apply an existing random forest to a new dataset, then report accuracy and plot the results
    #
    # FUNCTION ARGUMENTS:
    # rf: The existing "ranger" model OR a list containing element "rf" that has the existing "ranger" model
    # dfTest: the new dataset for predictions
    # trueCol: column containing true value
    # isContVar: boolean, is the variable continuous? (default FALSE means categorical)
    # reportR2: boolean, should accuracy be calculated as R-squared?
    #           (FALSE measures as categorical)
    # plotCont: boolean, should plotting assume continuous variables?
    #           (FALSE assumes confusion plot for categorical variables)
    # reportAcc: boolean, should accuracy be reported (printed to output)?
    # rndReport: rounding for reporting, passed to reportAccuracy() as rndReport
    # useLabel: label for data to be used in reporting
    # useTitle: title to be used for chart (NULL means create from trueCol)
    # useSub: subtitle to be used for chart (NULL means none)
    # rndTo: every number in x should be rounded to the nearest rndTo
    #        NULL means no rounding (default)
    #        -1L means make an estimate based on data
    # rndBucketsAuto: integer, if rndTo is -1L, about how many buckets are desired for predictions?
    # nSig: number of significant digits for automatically calculated rounding parameter
    #       (NULL means calculate exactly)
    # refXY: boolean, should a reference line for y=x be included? (relevant only for continuous)
    # returnData: boolean, should a list be returned containing tstPred and rfAcc?
    
    # Get the ranger data: either rf itself, or element "rf" of a list (such as runFullRF() output)
    errRF <- "\nERROR: rf must be of class 'ranger' OR a list with element 'rf' that is of class 'ranger'"
    if(!inherits(rf, "ranger")) {
        if(!("rf" %in% names(rf))) stop(errRF)
        rf <- rf[["rf"]]
    }
    if(!inherits(rf, "ranger")) stop(errRF)
    
    # Predict on new dataset
    tstPred <- predictRF(rf=rf, df=dfTest)

    # Report on accuracy
    rfAcc <- reportAccuracy(tstPred, 
                            trueCol=trueCol, 
                            reportAcc=reportAcc,
                            rndReport=rndReport, 
                            useLabel=useLabel, 
                            reportR2=reportR2,
                            returnAcc=TRUE
                            )

    # Plot confusion data
    plotConfusion(tstPred, 
                  trueCol=trueCol, 
                  useTitle=useTitle,
                  useSub=useSub, 
                  plotCont=plotCont, 
                  rndTo=rndTo,
                  rndBucketsAuto=rndBucketsAuto,
                  nSig=nSig,
                  refXY=refXY
                  )
    
    # Return data if requested
    if(isTRUE(returnData)) return(list(tstPred=tstPred, rfAcc=rfAcc))
    
}

Key mapping tables for available metrics are also copied:

hourlyMetrics <- "temperature_2m,relativehumidity_2m,dewpoint_2m,apparent_temperature,pressure_msl,surface_pressure,precipitation,rain,snowfall,cloudcover,cloudcover_low,cloudcover_mid,cloudcover_high,shortwave_radiation,direct_radiation,direct_normal_irradiance,diffuse_radiation,windspeed_10m,windspeed_100m,winddirection_10m,winddirection_100m,windgusts_10m,et0_fao_evapotranspiration,weathercode,vapor_pressure_deficit,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm"
dailyMetrics <- "weathercode,temperature_2m_max,temperature_2m_min,apparent_temperature_max,apparent_temperature_min,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,sunrise,sunset,windspeed_10m_max,windgusts_10m_max,winddirection_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration"

hourlyDescription <- "Air temperature at 2 meters above ground\nRelative humidity at 2 meters above ground\nDew point temperature at 2 meters above ground\nApparent temperature is the perceived feels-like temperature combining wind chill factor, relative humidity and solar radiation\nAtmospheric air pressure reduced to mean sea level (msl) or pressure at surface. Typically pressure on mean sea level is used in meteorology. Surface pressure gets lower with increasing elevation.\nAtmospheric air pressure reduced to mean sea level (msl) or pressure at surface. Typically pressure on mean sea level is used in meteorology. Surface pressure gets lower with increasing elevation.\nTotal precipitation (rain, showers, snow) sum of the preceding hour. Data is stored with a 0.1 mm precision. If precipitation data is summed up to monthly sums, there might be small inconsistencies with the total precipitation amount.\nOnly liquid precipitation of the preceding hour including local showers and rain from large scale systems.\nSnowfall amount of the preceding hour in centimeters. For the water equivalent in millimeter, divide by 7. E.g. 7 cm snow = 10 mm precipitation water equivalent\nTotal cloud cover as an area fraction\nLow level clouds and fog up to 2 km altitude\nMid level clouds from 2 to 6 km altitude\nHigh level clouds from 6 km altitude\nShortwave solar radiation as average of the preceding hour. This is equal to the total global horizontal irradiation\nDirect solar radiation as average of the preceding hour on the horizontal plane and the normal plane (perpendicular to the sun)\nDirect solar radiation as average of the preceding hour on the horizontal plane and the normal plane (perpendicular to the sun)\nDiffuse solar radiation as average of the preceding hour\nWind speed at 10 or 100 meters above ground. Wind speed on 10 meters is the standard level.\nWind speed at 10 or 100 meters above ground. 
Wind speed on 10 meters is the standard level.\nWind direction at 10 or 100 meters above ground\nWind direction at 10 or 100 meters above ground\nGusts at 10 meters above ground of the indicated hour. Wind gusts in CERRA are defined as the maximum wind gusts of the preceding hour. Please consult the ECMWF IFS documentation for more information on how wind gusts are parameterized in weather models.\nET0 Reference Evapotranspiration of a well watered grass field. Based on FAO-56 Penman-Monteith equations ET0 is calculated from temperature, wind speed, humidity and solar radiation. Unlimited soil water is assumed. ET0 is commonly used to estimate the required irrigation for plants.\nWeather condition as a numeric code. Follow WMO weather interpretation codes. See table below for details. Weather code is calculated from cloud cover analysis, precipitation and snowfall. As barely no information about atmospheric stability is available, estimation about thunderstorms is not possible.\nVapor Pressure Deificit (VPD) in kilopascal (kPa). For high VPD (>1.6), water transpiration of plants increases. For low VPD (<0.4), transpiration decreases\nAverage temperature of different soil levels below ground.\nAverage temperature of different soil levels below ground.\nAverage temperature of different soil levels below ground.\nAverage temperature of different soil levels below ground.\nAverage soil water content as volumetric mixing ratio at 0-7, 7-28, 28-100 and 100-255 cm depths.\nAverage soil water content as volumetric mixing ratio at 0-7, 7-28, 28-100 and 100-255 cm depths.\nAverage soil water content as volumetric mixing ratio at 0-7, 7-28, 28-100 and 100-255 cm depths.\nAverage soil water content as volumetric mixing ratio at 0-7, 7-28, 28-100 and 100-255 cm depths."
# Descriptions for the daily metrics, stored as a single newline-delimited string
# One element per daily metric, joined with "\n" so the string can later be split back into descriptions
dailyDescription <- paste(
    c("The most severe weather condition on a given day", 
      "Maximum and minimum daily air temperature at 2 meters above ground", 
      "Maximum and minimum daily air temperature at 2 meters above ground", 
      "Maximum and minimum daily apparent temperature", 
      "Maximum and minimum daily apparent temperature", 
      "Sum of daily precipitation (including rain, showers and snowfall)", 
      "Sum of daily rain", 
      "Sum of daily snowfall", 
      "The number of hours with rain", 
      "Sun rise and set times", 
      "Sun rise and set times", 
      "Maximum wind speed and gusts on a day", 
      "Maximum wind speed and gusts on a day", 
      "Dominant wind direction", 
      "The sum of solar radiaion on a given day in Megajoules", 
      "Daily sum of ET0 Reference Evapotranspiration of a well watered grass field"
      ), 
    collapse="\n"
)

# Build a lookup tibble for hourly metrics
# Metric names are split on commas and descriptions on newlines; the two vectors are paired by position
tblMetricsHourly <- tibble::tibble(
    metric=str_split_1(hourlyMetrics, ","), 
    description=str_split_1(hourlyDescription, "\n")
)
# Print up to 50 rows so that the table is shown beyond the default row limit
print(tblMetricsHourly, n=50)
## # A tibble: 33 × 2
##    metric                        description                                    
##    <chr>                         <chr>                                          
##  1 temperature_2m                Air temperature at 2 meters above ground       
##  2 relativehumidity_2m           Relative humidity at 2 meters above ground     
##  3 dewpoint_2m                   Dew point temperature at 2 meters above ground 
##  4 apparent_temperature          Apparent temperature is the perceived feels-li…
##  5 pressure_msl                  Atmospheric air pressure reduced to mean sea l…
##  6 surface_pressure              Atmospheric air pressure reduced to mean sea l…
##  7 precipitation                 Total precipitation (rain, showers, snow) sum …
##  8 rain                          Only liquid precipitation of the preceding hou…
##  9 snowfall                      Snowfall amount of the preceding hour in centi…
## 10 cloudcover                    Total cloud cover as an area fraction          
## 11 cloudcover_low                Low level clouds and fog up to 2 km altitude   
## 12 cloudcover_mid                Mid level clouds from 2 to 6 km altitude       
## 13 cloudcover_high               High level clouds from 6 km altitude           
## 14 shortwave_radiation           Shortwave solar radiation as average of the pr…
## 15 direct_radiation              Direct solar radiation as average of the prece…
## 16 direct_normal_irradiance      Direct solar radiation as average of the prece…
## 17 diffuse_radiation             Diffuse solar radiation as average of the prec…
## 18 windspeed_10m                 Wind speed at 10 or 100 meters above ground. W…
## 19 windspeed_100m                Wind speed at 10 or 100 meters above ground. W…
## 20 winddirection_10m             Wind direction at 10 or 100 meters above ground
## 21 winddirection_100m            Wind direction at 10 or 100 meters above ground
## 22 windgusts_10m                 Gusts at 10 meters above ground of the indicat…
## 23 et0_fao_evapotranspiration    ET0 Reference Evapotranspiration of a well wat…
## 24 weathercode                   Weather condition as a numeric code. Follow WM…
## 25 vapor_pressure_deficit        Vapor Pressure Deificit (VPD) in kilopascal (k…
## 26 soil_temperature_0_to_7cm     Average temperature of different soil levels b…
## 27 soil_temperature_7_to_28cm    Average temperature of different soil levels b…
## 28 soil_temperature_28_to_100cm  Average temperature of different soil levels b…
## 29 soil_temperature_100_to_255cm Average temperature of different soil levels b…
## 30 soil_moisture_0_to_7cm        Average soil water content as volumetric mixin…
## 31 soil_moisture_7_to_28cm       Average soil water content as volumetric mixin…
## 32 soil_moisture_28_to_100cm     Average soil water content as volumetric mixin…
## 33 soil_moisture_100_to_255cm    Average soil water content as volumetric mixin…
# Build a lookup tibble for daily metrics
# Metric names are split on commas and descriptions on newlines; the two vectors are paired by position
tblMetricsDaily <- tibble::tibble(
    metric=str_split_1(dailyMetrics, ","), 
    description=str_split_1(dailyDescription, "\n")
)
tblMetricsDaily
## # A tibble: 16 × 2
##    metric                     description                                       
##    <chr>                      <chr>                                             
##  1 weathercode                The most severe weather condition on a given day  
##  2 temperature_2m_max         Maximum and minimum daily air temperature at 2 me…
##  3 temperature_2m_min         Maximum and minimum daily air temperature at 2 me…
##  4 apparent_temperature_max   Maximum and minimum daily apparent temperature    
##  5 apparent_temperature_min   Maximum and minimum daily apparent temperature    
##  6 precipitation_sum          Sum of daily precipitation (including rain, showe…
##  7 rain_sum                   Sum of daily rain                                 
##  8 snowfall_sum               Sum of daily snowfall                             
##  9 precipitation_hours        The number of hours with rain                     
## 10 sunrise                    Sun rise and set times                            
## 11 sunset                     Sun rise and set times                            
## 12 windspeed_10m_max          Maximum wind speed and gusts on a day             
## 13 windgusts_10m_max          Maximum wind speed and gusts on a day             
## 14 winddirection_10m_dominant Dominant wind direction                           
## 15 shortwave_radiation_sum    The sum of solar radiaion on a given day in Megaj…
## 16 et0_fao_evapotranspiration Daily sum of ET0 Reference Evapotranspiration of …

A function is written to process saved data for later use:

formatOpenMeteoJSON <- function(x, 
                                glimpseData=TRUE, 
                                addVars=FALSE, 
                                addExtract="tblHourly", 
                                showStats=addVars
                                ) {
    
    # Reads a saved Open-Meteo JSON file and, optionally, adds date/time and percentile variables to one element
    
    # FUNCTION ARGUMENTS:
    # x: Saved json file for passage to readOpenMeteoJSON
    # glimpseData: boolean, should a glimpse of the file and metadata be shown?
    # addVars: boolean, should variables be added for later processing?
    # addExtract: list element to be extracted (relevant only for addVars=TRUE)
    # showStats: boolean, should counts of key elements be shown (relevant only for addVars=TRUE)
    
    # RETURN VALUE:
    # addVars=FALSE: the list returned by readOpenMeteoJSON(x), unchanged
    # addVars=TRUE: the element named by addExtract, with the added variables
    #               (year, month, hour, fct_hour, tod, doy, season, todSeason, and pct_* for each numeric column)

    # Read file
    lst <- readOpenMeteoJSON(x)
    
    # Show a glimpse if requested
    if(isTRUE(glimpseData)) {
        print(lst)
        prettyOpenMeteoMeta(lst)
    }
    
    # If no variables to be added, return the file
    if(!isTRUE(addVars)) return(lst)
    
    # Extract the requested element, failing early with a clear message if it is absent
    # (e.g., requesting "tblHourly" from a file that holds only daily data would otherwise fail inside mutate)
    dfExtract <- lst[[addExtract]]
    if(is.null(dfExtract)) {
        stop("Element '", addExtract, "' is NULL or missing in the data read from '", x, "'", 
             "; available elements: ", paste(names(lst), collapse=", "), 
             call.=FALSE
             )
    }
    
    # Add statistics
    # NOTE: order matters inside mutate; season and tod are created as character, combined in todSeason, 
    # and only then converted to factors, while across() runs last so that it also covers year, hour and doy
    df <- dfExtract %>%
        mutate(year=lubridate::year(date), 
               month=factor(month.abb[lubridate::month(date)], levels=month.abb), 
               hour=lubridate::hour(time), 
               fct_hour=factor(hour), 
               tod=ifelse(hour>=7 & hour<=18, "Day", "Night"), 
               doy=lubridate::yday(date),
               season=case_when(month %in% c("Mar", "Apr", "May") ~ "Spring", 
                                month %in% c("Jun", "Jul", "Aug") ~ "Summer", 
                                month %in% c("Sep", "Oct", "Nov") ~ "Fall", 
                                month %in% c("Dec", "Jan", "Feb") ~ "Winter", 
                                TRUE~"typo"
                                ), 
               todSeason=paste0(season, "-", tod), 
               tod=factor(tod, levels=c("Day", "Night")), 
               season=factor(season, levels=c("Spring", "Summer", "Fall", "Winter")), 
               todSeason=factor(todSeason, 
                                levels=paste0(rep(c("Spring", "Summer", "Fall", "Winter"), each=2), 
                                              "-", 
                                              c("Day", "Night")
                                              )
                                ),
               # Percentile rank (0-100, rounded) of every numeric column, stored as pct_<column>
               across(where(is.numeric), .fns=function(v) round(100*percent_rank(v)), .names="pct_{.col}")
               )
    
    # Show counts if requested
    if(isTRUE(showStats)) {
        # Glimpse file
        glimpse(df)
        # Counts of day-of-year/month
        p1 <- df %>% 
            count(doy, month) %>% 
            ggplot(aes(y=doy, x=month)) + 
            geom_boxplot(aes(weight=n), fill="lightblue") + 
            labs(title="Observations by day-of-year and month", x=NULL, y="Day of Year")
        print(p1)
        # Counts of year/month
        p2 <- df %>% 
            count(year, month) %>% 
            ggplot(aes(y=factor(year), x=month)) + 
            geom_tile(aes(fill=n)) + 
            geom_text(aes(label=n), size=3) + 
            scale_fill_continuous("# Records", low="white", high="green") + 
            labs(title="Records by year and month", x=NULL, y=NULL)
        print(p2)
        # Counts of todSeason-season-tod, hour-fct_hour-tod, and month-season
        df %>% count(todSeason, season, tod) %>% print()
        df %>% count(hour, fct_hour, tod) %>% print(n=30)
        df %>% count(month, season) %>% print()
    }
    
    # Return the file
    df
    
}

Core daily datasets are loaded:

# Read daily JSON file (NYC)
# With the default addVars=FALSE, the list read from the file is returned as-is
# With the default glimpseData=TRUE, the list and its metadata are printed
nycOMDaily <- formatOpenMeteoJSON("testOM_daily_nyc.json")
## 
## Objects in JSON include: latitude, longitude, generationtime_ms, utc_offset_seconds, timezone, timezone_abbreviation, elevation, daily_units, daily 
## 
## $tblDaily
## # A tibble: 4,914 × 18
##    date       time       weathercode temperature_2m_max temperature_2m_min
##    <date>     <chr>            <int>              <dbl>              <dbl>
##  1 2010-01-01 2010-01-01          73                5                 -1.4
##  2 2010-01-02 2010-01-02          71               -0.6               -9.2
##  3 2010-01-03 2010-01-03          71               -4.8              -10  
##  4 2010-01-04 2010-01-04           1               -0.8               -7.3
##  5 2010-01-05 2010-01-05           1               -0.2               -7.3
##  6 2010-01-06 2010-01-06           2                1.1               -5.3
##  7 2010-01-07 2010-01-07           2                3.6               -3.7
##  8 2010-01-08 2010-01-08          71                1.9               -5.7
##  9 2010-01-09 2010-01-09           0               -1.4               -7.7
## 10 2010-01-10 2010-01-10           0               -1.7              -10.3
## # ℹ 4,904 more rows
## # ℹ 13 more variables: apparent_temperature_max <dbl>,
## #   apparent_temperature_min <dbl>, precipitation_sum <dbl>, rain_sum <dbl>,
## #   snowfall_sum <dbl>, precipitation_hours <dbl>, sunrise <chr>, sunset <chr>,
## #   windspeed_10m_max <dbl>, windgusts_10m_max <dbl>,
## #   winddirection_10m_dominant <int>, shortwave_radiation_sum <dbl>,
## #   et0_fao_evapotranspiration <dbl>
## 
## $tblHourly
## NULL
## 
## $tblUnits
## # A tibble: 17 × 4
##    metricType  name                       value      description                
##    <chr>       <chr>                      <chr>      <chr>                      
##  1 daily_units time                       "iso8601"  <NA>                       
##  2 daily_units weathercode                "wmo code" The most severe weather co…
##  3 daily_units temperature_2m_max         "deg C"    Maximum and minimum daily …
##  4 daily_units temperature_2m_min         "deg C"    Maximum and minimum daily …
##  5 daily_units apparent_temperature_max   "deg C"    Maximum and minimum daily …
##  6 daily_units apparent_temperature_min   "deg C"    Maximum and minimum daily …
##  7 daily_units precipitation_sum          "mm"       Sum of daily precipitation…
##  8 daily_units rain_sum                   "mm"       Sum of daily rain          
##  9 daily_units snowfall_sum               "cm"       Sum of daily snowfall      
## 10 daily_units precipitation_hours        "h"        The number of hours with r…
## 11 daily_units sunrise                    "iso8601"  Sun rise and set times     
## 12 daily_units sunset                     "iso8601"  Sun rise and set times     
## 13 daily_units windspeed_10m_max          "km/h"     Maximum wind speed and gus…
## 14 daily_units windgusts_10m_max          "km/h"     Maximum wind speed and gus…
## 15 daily_units winddirection_10m_dominant "deg "     Dominant wind direction    
## 16 daily_units shortwave_radiation_sum    "MJ/m²"    The sum of solar radiaion …
## 17 daily_units et0_fao_evapotranspiration "mm"       Daily sum of ET0 Reference…
## 
## $tblDescription
## # A tibble: 1 × 7
##   latitude longitude generationtime_ms utc_offset_seconds timezone        
##      <dbl>     <dbl>             <dbl>              <int> <chr>           
## 1     40.7     -73.9              101.             -14400 America/New_York
## # ℹ 2 more variables: timezone_abbreviation <chr>, elevation <dbl>
## 
## 
## latitude: 40.7
## longitude: -73.9
## generationtime_ms: 100.914
## utc_offset_seconds: -14400
## timezone: America/New_York
## timezone_abbreviation: EDT
## elevation: 36
# Read daily JSON file (Los Angeles); default arguments return the list as-is and print it with its metadata
laxOMDaily <- formatOpenMeteoJSON("testOM_daily_lax.json")
## 
## Objects in JSON include: latitude, longitude, generationtime_ms, utc_offset_seconds, timezone, timezone_abbreviation, elevation, daily_units, daily 
## 
## $tblDaily
## # A tibble: 5,113 × 18
##    date       time       weathercode temperature_2m_max temperature_2m_min
##    <date>     <chr>            <int>              <dbl>              <dbl>
##  1 2010-01-01 2010-01-01           2               20.1                4.7
##  2 2010-01-02 2010-01-02           1               23.2                6.7
##  3 2010-01-03 2010-01-03           1               23                  6.5
##  4 2010-01-04 2010-01-04           2               22.1                6.5
##  5 2010-01-05 2010-01-05           1               22.9                5  
##  6 2010-01-06 2010-01-06           2               23.2                7.7
##  7 2010-01-07 2010-01-07           1               23.3                5.2
##  8 2010-01-08 2010-01-08           1               22.8                8.4
##  9 2010-01-09 2010-01-09           2               21.5                7.2
## 10 2010-01-10 2010-01-10           1               24                  7.5
## # ℹ 5,103 more rows
## # ℹ 13 more variables: apparent_temperature_max <dbl>,
## #   apparent_temperature_min <dbl>, precipitation_sum <dbl>, rain_sum <dbl>,
## #   snowfall_sum <dbl>, precipitation_hours <dbl>, sunrise <chr>, sunset <chr>,
## #   windspeed_10m_max <dbl>, windgusts_10m_max <dbl>,
## #   winddirection_10m_dominant <int>, shortwave_radiation_sum <dbl>,
## #   et0_fao_evapotranspiration <dbl>
## 
## $tblHourly
## NULL
## 
## $tblUnits
## # A tibble: 17 × 4
##    metricType  name                       value      description                
##    <chr>       <chr>                      <chr>      <chr>                      
##  1 daily_units time                       "iso8601"  <NA>                       
##  2 daily_units weathercode                "wmo code" The most severe weather co…
##  3 daily_units temperature_2m_max         "deg C"    Maximum and minimum daily …
##  4 daily_units temperature_2m_min         "deg C"    Maximum and minimum daily …
##  5 daily_units apparent_temperature_max   "deg C"    Maximum and minimum daily …
##  6 daily_units apparent_temperature_min   "deg C"    Maximum and minimum daily …
##  7 daily_units precipitation_sum          "mm"       Sum of daily precipitation…
##  8 daily_units rain_sum                   "mm"       Sum of daily rain          
##  9 daily_units snowfall_sum               "cm"       Sum of daily snowfall      
## 10 daily_units precipitation_hours        "h"        The number of hours with r…
## 11 daily_units sunrise                    "iso8601"  Sun rise and set times     
## 12 daily_units sunset                     "iso8601"  Sun rise and set times     
## 13 daily_units windspeed_10m_max          "km/h"     Maximum wind speed and gus…
## 14 daily_units windgusts_10m_max          "km/h"     Maximum wind speed and gus…
## 15 daily_units winddirection_10m_dominant "deg "     Dominant wind direction    
## 16 daily_units shortwave_radiation_sum    "MJ/m²"    The sum of solar radiaion …
## 17 daily_units et0_fao_evapotranspiration "mm"       Daily sum of ET0 Reference…
## 
## $tblDescription
## # A tibble: 1 × 7
##   latitude longitude generationtime_ms utc_offset_seconds timezone           
##      <dbl>     <dbl>             <dbl>              <int> <chr>              
## 1     34.1     -118.              58.9             -25200 America/Los_Angeles
## # ℹ 2 more variables: timezone_abbreviation <chr>, elevation <dbl>
## 
## 
## latitude: 34.13005
## longitude: -118.4981
## generationtime_ms: 58.85398
## utc_offset_seconds: -25200
## timezone: America/Los_Angeles
## timezone_abbreviation: PDT
## elevation: 333
# Read daily JSON file (Chicago); default arguments return the list as-is and print it with its metadata
chiOMDaily <- formatOpenMeteoJSON("testOM_daily_chi.json")
## 
## Objects in JSON include: latitude, longitude, generationtime_ms, utc_offset_seconds, timezone, timezone_abbreviation, elevation, daily_units, daily 
## 
## $tblDaily
## # A tibble: 5,113 × 18
##    date       time       weathercode temperature_2m_max temperature_2m_min
##    <date>     <chr>            <int>              <dbl>              <dbl>
##  1 2010-01-01 2010-01-01           3               -8.6              -13.4
##  2 2010-01-02 2010-01-02           2              -10.4              -15.1
##  3 2010-01-03 2010-01-03           3               -7.9              -13.8
##  4 2010-01-04 2010-01-04           3               -6.9              -12.3
##  5 2010-01-05 2010-01-05           3               -4.8               -9.8
##  6 2010-01-06 2010-01-06          71               -4.9               -9  
##  7 2010-01-07 2010-01-07          73               -5.2               -8.5
##  8 2010-01-08 2010-01-08          73               -3                 -9.4
##  9 2010-01-09 2010-01-09           3               -5.8              -12.3
## 10 2010-01-10 2010-01-10           3               -8.8              -19.4
## # ℹ 5,103 more rows
## # ℹ 13 more variables: apparent_temperature_max <dbl>,
## #   apparent_temperature_min <dbl>, precipitation_sum <dbl>, rain_sum <dbl>,
## #   snowfall_sum <dbl>, precipitation_hours <dbl>, sunrise <chr>, sunset <chr>,
## #   windspeed_10m_max <dbl>, windgusts_10m_max <dbl>,
## #   winddirection_10m_dominant <int>, shortwave_radiation_sum <dbl>,
## #   et0_fao_evapotranspiration <dbl>
## 
## $tblHourly
## NULL
## 
## $tblUnits
## # A tibble: 17 × 4
##    metricType  name                       value      description                
##    <chr>       <chr>                      <chr>      <chr>                      
##  1 daily_units time                       "iso8601"  <NA>                       
##  2 daily_units weathercode                "wmo code" The most severe weather co…
##  3 daily_units temperature_2m_max         "deg C"    Maximum and minimum daily …
##  4 daily_units temperature_2m_min         "deg C"    Maximum and minimum daily …
##  5 daily_units apparent_temperature_max   "deg C"    Maximum and minimum daily …
##  6 daily_units apparent_temperature_min   "deg C"    Maximum and minimum daily …
##  7 daily_units precipitation_sum          "mm"       Sum of daily precipitation…
##  8 daily_units rain_sum                   "mm"       Sum of daily rain          
##  9 daily_units snowfall_sum               "cm"       Sum of daily snowfall      
## 10 daily_units precipitation_hours        "h"        The number of hours with r…
## 11 daily_units sunrise                    "iso8601"  Sun rise and set times     
## 12 daily_units sunset                     "iso8601"  Sun rise and set times     
## 13 daily_units windspeed_10m_max          "km/h"     Maximum wind speed and gus…
## 14 daily_units windgusts_10m_max          "km/h"     Maximum wind speed and gus…
## 15 daily_units winddirection_10m_dominant "deg "     Dominant wind direction    
## 16 daily_units shortwave_radiation_sum    "MJ/m²"    The sum of solar radiaion …
## 17 daily_units et0_fao_evapotranspiration "mm"       Daily sum of ET0 Reference…
## 
## $tblDescription
## # A tibble: 1 × 7
##   latitude longitude generationtime_ms utc_offset_seconds timezone       
##      <dbl>     <dbl>             <dbl>              <int> <chr>          
## 1     41.9     -87.6              59.4             -18000 America/Chicago
## # ℹ 2 more variables: timezone_abbreviation <chr>, elevation <dbl>
## 
## 
## latitude: 41.86292
## longitude: -87.64877
## generationtime_ms: 59.38601
## utc_offset_seconds: -18000
## timezone: America/Chicago
## timezone_abbreviation: CDT
## elevation: 180
# Read daily JSON file (Houston); default arguments return the list as-is and print it with its metadata
houOMDaily <- formatOpenMeteoJSON("testOM_daily_hou.json")
## 
## Objects in JSON include: latitude, longitude, generationtime_ms, utc_offset_seconds, timezone, timezone_abbreviation, elevation, daily_units, daily 
## 
## $tblDaily
## # A tibble: 5,113 × 18
##    date       time       weathercode temperature_2m_max temperature_2m_min
##    <date>     <chr>            <int>              <dbl>              <dbl>
##  1 2010-01-01 2010-01-01           3               11.8                3.9
##  2 2010-01-02 2010-01-02           1               12                  0.7
##  3 2010-01-03 2010-01-03           3               10                  4.4
##  4 2010-01-04 2010-01-04           3                7.6                1.8
##  5 2010-01-05 2010-01-05           0                8                 -1.9
##  6 2010-01-06 2010-01-06          51               12.7               -0.1
##  7 2010-01-07 2010-01-07          55               13.4               -0.2
##  8 2010-01-08 2010-01-08           2                0.8               -3  
##  9 2010-01-09 2010-01-09           0                4.4               -5.5
## 10 2010-01-10 2010-01-10           0                5.9               -4.6
## # ℹ 5,103 more rows
## # ℹ 13 more variables: apparent_temperature_max <dbl>,
## #   apparent_temperature_min <dbl>, precipitation_sum <dbl>, rain_sum <dbl>,
## #   snowfall_sum <dbl>, precipitation_hours <dbl>, sunrise <chr>, sunset <chr>,
## #   windspeed_10m_max <dbl>, windgusts_10m_max <dbl>,
## #   winddirection_10m_dominant <int>, shortwave_radiation_sum <dbl>,
## #   et0_fao_evapotranspiration <dbl>
## 
## $tblHourly
## NULL
## 
## $tblUnits
## # A tibble: 17 × 4
##    metricType  name                       value      description                
##    <chr>       <chr>                      <chr>      <chr>                      
##  1 daily_units time                       "iso8601"  <NA>                       
##  2 daily_units weathercode                "wmo code" The most severe weather co…
##  3 daily_units temperature_2m_max         "deg C"    Maximum and minimum daily …
##  4 daily_units temperature_2m_min         "deg C"    Maximum and minimum daily …
##  5 daily_units apparent_temperature_max   "deg C"    Maximum and minimum daily …
##  6 daily_units apparent_temperature_min   "deg C"    Maximum and minimum daily …
##  7 daily_units precipitation_sum          "mm"       Sum of daily precipitation…
##  8 daily_units rain_sum                   "mm"       Sum of daily rain          
##  9 daily_units snowfall_sum               "cm"       Sum of daily snowfall      
## 10 daily_units precipitation_hours        "h"        The number of hours with r…
## 11 daily_units sunrise                    "iso8601"  Sun rise and set times     
## 12 daily_units sunset                     "iso8601"  Sun rise and set times     
## 13 daily_units windspeed_10m_max          "km/h"     Maximum wind speed and gus…
## 14 daily_units windgusts_10m_max          "km/h"     Maximum wind speed and gus…
## 15 daily_units winddirection_10m_dominant "deg "     Dominant wind direction    
## 16 daily_units shortwave_radiation_sum    "MJ/m²"    The sum of solar radiaion …
## 17 daily_units et0_fao_evapotranspiration "mm"       Daily sum of ET0 Reference…
## 
## $tblDescription
## # A tibble: 1 × 7
##   latitude longitude generationtime_ms utc_offset_seconds timezone  
##      <dbl>     <dbl>             <dbl>              <int> <chr>     
## 1     29.8     -95.4              64.0             -18000 US/Central
## # ℹ 2 more variables: timezone_abbreviation <chr>, elevation <dbl>
## 
## 
## latitude: 29.77153
## longitude: -95.43555
## generationtime_ms: 63.96198
## utc_offset_seconds: -18000
## timezone: US/Central
## timezone_abbreviation: CDT
## elevation: 17

Processed hourly data for NYC and LA are loaded:

# Read hourly JSON file (NYC and LA)
# addVars=TRUE extracts the default element (tblHourly) and adds date/time factors and pct_* percentile columns
# showStats defaults to addVars, so the glimpse, plots and count tables are also produced
nycTemp <- formatOpenMeteoJSON("testOM_hourly_nyc.json", addVars=TRUE)
## 
## Objects in JSON include: latitude, longitude, generationtime_ms, utc_offset_seconds, timezone, timezone_abbreviation, elevation, hourly_units, hourly 
## 
## $tblDaily
## NULL
## 
## $tblHourly
## # A tibble: 117,936 × 37
##    time                date        hour temperature_2m relativehumidity_2m
##    <dttm>              <date>     <int>          <dbl>               <int>
##  1 2010-01-01 00:00:00 2010-01-01     0           -1.1                  95
##  2 2010-01-01 01:00:00 2010-01-01     1           -1                    96
##  3 2010-01-01 02:00:00 2010-01-01     2           -1                    96
##  4 2010-01-01 03:00:00 2010-01-01     3           -0.8                  97
##  5 2010-01-01 04:00:00 2010-01-01     4           -0.9                  97
##  6 2010-01-01 05:00:00 2010-01-01     5           -0.8                  97
##  7 2010-01-01 06:00:00 2010-01-01     6           -0.7                  97
##  8 2010-01-01 07:00:00 2010-01-01     7           -0.5                  97
##  9 2010-01-01 08:00:00 2010-01-01     8           -0.6                  97
## 10 2010-01-01 09:00:00 2010-01-01     9           -0.6                  97
## # ℹ 117,926 more rows
## # ℹ 32 more variables: dewpoint_2m <dbl>, apparent_temperature <dbl>,
## #   pressure_msl <dbl>, surface_pressure <dbl>, precipitation <dbl>,
## #   rain <dbl>, snowfall <dbl>, cloudcover <int>, cloudcover_low <int>,
## #   cloudcover_mid <int>, cloudcover_high <int>, shortwave_radiation <dbl>,
## #   direct_radiation <dbl>, direct_normal_irradiance <dbl>,
## #   diffuse_radiation <dbl>, windspeed_10m <dbl>, windspeed_100m <dbl>, …
## 
## $tblUnits
## # A tibble: 34 × 4
##    metricType   name                 value   description                        
##    <chr>        <chr>                <chr>   <chr>                              
##  1 hourly_units time                 iso8601 <NA>                               
##  2 hourly_units temperature_2m       deg C   Air temperature at 2 meters above …
##  3 hourly_units relativehumidity_2m  %       Relative humidity at 2 meters abov…
##  4 hourly_units dewpoint_2m          deg C   Dew point temperature at 2 meters …
##  5 hourly_units apparent_temperature deg C   Apparent temperature is the percei…
##  6 hourly_units pressure_msl         hPa     Atmospheric air pressure reduced t…
##  7 hourly_units surface_pressure     hPa     Atmospheric air pressure reduced t…
##  8 hourly_units precipitation        mm      Total precipitation (rain, showers…
##  9 hourly_units rain                 mm      Only liquid precipitation of the p…
## 10 hourly_units snowfall             cm      Snowfall amount of the preceding h…
## # ℹ 24 more rows
## 
## $tblDescription
## # A tibble: 1 × 7
##   latitude longitude generationtime_ms utc_offset_seconds timezone        
##      <dbl>     <dbl>             <dbl>              <int> <chr>           
## 1     40.7     -73.9              118.             -14400 America/New_York
## # ℹ 2 more variables: timezone_abbreviation <chr>, elevation <dbl>
## 
## 
## latitude: 40.7
## longitude: -73.9
## generationtime_ms: 118.0021
## utc_offset_seconds: -14400
## timezone: America/New_York
## timezone_abbreviation: EDT
## elevation: 36
## 
## Rows: 117,936
## Columns: 80
## $ time                              <dttm> 2010-01-01 00:00:00, 2010-01-01 01:…
## $ date                              <date> 2010-01-01, 2010-01-01, 2010-01-01,…
## $ hour                              <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11…
## $ temperature_2m                    <dbl> -1.1, -1.0, -1.0, -0.8, -0.9, -0.8, …
## $ relativehumidity_2m               <int> 95, 96, 96, 97, 97, 97, 97, 97, 97, …
## $ dewpoint_2m                       <dbl> -1.7, -1.6, -1.6, -1.2, -1.3, -1.2, …
## $ apparent_temperature              <dbl> -3.9, -3.9, -3.9, -3.7, -3.7, -3.6, …
## $ pressure_msl                      <dbl> 1017.2, 1016.5, 1015.9, 1015.6, 1015…
## $ surface_pressure                  <dbl> 1012.6, 1011.9, 1011.3, 1011.0, 1011…
## $ precipitation                     <dbl> 0.5, 0.5, 0.4, 0.3, 0.1, 0.0, 0.0, 0…
## $ rain                              <dbl> 0.0, 0.1, 0.1, 0.1, 0.0, 0.0, 0.0, 0…
## $ snowfall                          <dbl> 0.35, 0.28, 0.21, 0.14, 0.07, 0.00, …
## $ cloudcover                        <int> 90, 93, 80, 68, 71, 100, 100, 100, 1…
## $ cloudcover_low                    <int> 2, 8, 3, 6, 15, 51, 99, 99, 96, 77, …
## $ cloudcover_mid                    <int> 98, 96, 99, 98, 95, 97, 98, 99, 94, …
## $ cloudcover_high                   <int> 97, 93, 59, 13, 0, 0, 0, 0, 0, 0, 0,…
## $ shortwave_radiation               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 53, 11…
## $ direct_radiation                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 20…
## $ direct_normal_irradiance          <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0…
## $ diffuse_radiation                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 41, 93…
## $ windspeed_10m                     <dbl> 3.1, 3.5, 3.3, 3.9, 3.5, 3.4, 0.0, 1…
## $ windspeed_100m                    <dbl> 3.8, 3.1, 3.8, 4.7, 6.4, 5.7, 1.4, 1…
## $ winddirection_10m                 <int> 339, 336, 347, 338, 336, 342, 180, 2…
## $ winddirection_100m                <int> 41, 21, 17, 356, 344, 342, 360, 217,…
## $ windgusts_10m                     <dbl> 9.0, 9.7, 10.1, 7.6, 7.6, 6.8, 5.4, …
## $ et0_fao_evapotranspiration        <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, …
## $ weathercode                       <int> 73, 73, 73, 71, 71, 3, 3, 3, 3, 3, 3…
## $ vapor_pressure_deficit            <dbl> 0.03, 0.02, 0.02, 0.02, 0.02, 0.02, …
## $ soil_temperature_0_to_7cm         <dbl> -0.7, -0.7, -0.7, -0.6, -0.6, -0.6, …
## $ soil_temperature_7_to_28cm        <dbl> 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0…
## $ soil_temperature_28_to_100cm      <dbl> 4.2, 4.2, 4.1, 4.1, 4.1, 4.1, 4.1, 4…
## $ soil_temperature_100_to_255cm     <dbl> 10.6, 10.6, 10.6, 10.6, 10.6, 10.6, …
## $ soil_moisture_0_to_7cm            <dbl> 0.373, 0.374, 0.376, 0.377, 0.377, 0…
## $ soil_moisture_7_to_28cm           <dbl> 0.377, 0.377, 0.377, 0.377, 0.377, 0…
## $ soil_moisture_28_to_100cm         <dbl> 0.413, 0.413, 0.413, 0.413, 0.413, 0…
## $ soil_moisture_100_to_255cm        <dbl> 0.412, 0.412, 0.412, 0.412, 0.412, 0…
## $ origTime                          <chr> "2010-01-01T00:00", "2010-01-01T01:0…
## $ year                              <dbl> 2010, 2010, 2010, 2010, 2010, 2010, …
## $ month                             <fct> Jan, Jan, Jan, Jan, Jan, Jan, Jan, J…
## $ fct_hour                          <fct> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11…
## $ tod                               <fct> Night, Night, Night, Night, Night, N…
## $ doy                               <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ season                            <fct> Winter, Winter, Winter, Winter, Wint…
## $ todSeason                         <fct> Winter-Night, Winter-Night, Winter-N…
## $ pct_hour                          <dbl> 0, 4, 8, 13, 17, 21, 25, 29, 33, 38,…
## $ pct_temperature_2m                <dbl> 10, 10, 10, 11, 11, 11, 11, 12, 11, …
## $ pct_relativehumidity_2m           <dbl> 92, 94, 94, 96, 96, 96, 96, 96, 96, …
## $ pct_dewpoint_2m                   <dbl> 23, 24, 24, 25, 25, 25, 25, 25, 25, …
## $ pct_apparent_temperature          <dbl> 15, 15, 15, 15, 15, 15, 17, 17, 16, …
## $ pct_pressure_msl                  <dbl> 53, 49, 46, 44, 44, 41, 38, 36, 37, …
## $ pct_surface_pressure              <dbl> 51, 47, 44, 42, 42, 39, 36, 35, 36, …
## $ pct_precipitation                 <dbl> 93, 93, 92, 90, 86, 0, 0, 0, 0, 0, 0…
## $ pct_rain                          <dbl> 0, 87, 87, 87, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_snowfall                      <dbl> 99, 99, 99, 99, 98, 0, 0, 0, 0, 0, 0…
## $ pct_cloudcover                    <dbl> 77, 79, 72, 67, 68, 81, 81, 81, 81, …
## $ pct_cloudcover_low                <dbl> 51, 60, 53, 58, 65, 77, 90, 90, 88, …
## $ pct_cloudcover_mid                <dbl> 90, 89, 92, 90, 88, 89, 90, 92, 87, …
## $ pct_cloudcover_high               <dbl> 81, 76, 63, 49, 0, 0, 0, 0, 0, 0, 0,…
## $ pct_shortwave_radiation           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 49, 57, 6…
## $ pct_direct_radiation              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, 62…
## $ pct_direct_normal_irradiance      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 61, 61…
## $ pct_diffuse_radiation             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 50, 58, 7…
## $ pct_windspeed_10m                 <dbl> 3, 4, 3, 5, 4, 4, 0, 1, 2, 5, 8, 8, …
## $ pct_windspeed_100m                <dbl> 2, 1, 2, 3, 6, 5, 0, 0, 4, 9, 9, 8, …
## $ pct_winddirection_10m             <dbl> 94, 93, 96, 94, 93, 95, 35, 43, 53, …
## $ pct_winddirection_100m            <dbl> 8, 4, 3, 99, 96, 95, 100, 46, 51, 61…
## $ pct_windgusts_10m                 <dbl> 3, 4, 5, 1, 1, 1, 0, 0, 0, 1, 2, 4, …
## $ pct_et0_fao_evapotranspiration    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 22, 32, 4…
## $ pct_weathercode                   <dbl> 99, 99, 99, 98, 98, 69, 69, 69, 69, …
## $ pct_vapor_pressure_deficit        <dbl> 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 8, …
## $ pct_soil_temperature_0_to_7cm     <dbl> 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 9, 10,…
## $ pct_soil_temperature_7_to_28cm    <dbl> 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, …
## $ pct_soil_temperature_28_to_100cm  <dbl> 16, 16, 15, 15, 15, 15, 15, 15, 15, …
## $ pct_soil_temperature_100_to_255cm <dbl> 42, 42, 42, 42, 42, 42, 42, 42, 42, …
## $ pct_soil_moisture_0_to_7cm        <dbl> 70, 71, 73, 74, 74, 74, 74, 74, 73, …
## $ pct_soil_moisture_7_to_28cm       <dbl> 69, 69, 69, 69, 69, 68, 68, 68, 68, …
## $ pct_soil_moisture_28_to_100cm     <dbl> 96, 96, 96, 96, 96, 96, 96, 96, 96, …
## $ pct_soil_moisture_100_to_255cm    <dbl> 96, 96, 96, 96, 96, 96, 96, 96, 96, …
## $ pct_year                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_doy                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

## # A tibble: 8 × 4
##   todSeason    season tod       n
##   <fct>        <fct>  <fct> <int>
## 1 Spring-Day   Spring Day   15456
## 2 Spring-Night Spring Night 15456
## 3 Summer-Day   Summer Day   14532
## 4 Summer-Night Summer Night 14532
## 5 Fall-Day     Fall   Day   14196
## 6 Fall-Night   Fall   Night 14196
## 7 Winter-Day   Winter Day   14784
## 8 Winter-Night Winter Night 14784
## # A tibble: 24 × 4
##     hour fct_hour tod       n
##    <int> <fct>    <fct> <int>
##  1     0 0        Night  4914
##  2     1 1        Night  4914
##  3     2 2        Night  4914
##  4     3 3        Night  4914
##  5     4 4        Night  4914
##  6     5 5        Night  4914
##  7     6 6        Night  4914
##  8     7 7        Day    4914
##  9     8 8        Day    4914
## 10     9 9        Day    4914
## 11    10 10       Day    4914
## 12    11 11       Day    4914
## 13    12 12       Day    4914
## 14    13 13       Day    4914
## 15    14 14       Day    4914
## 16    15 15       Day    4914
## 17    16 16       Day    4914
## 18    17 17       Day    4914
## 19    18 18       Day    4914
## 20    19 19       Night  4914
## 21    20 20       Night  4914
## 22    21 21       Night  4914
## 23    22 22       Night  4914
## 24    23 23       Night  4914
## # A tibble: 12 × 3
##    month season     n
##    <fct> <fct>  <int>
##  1 Jan   Winter 10416
##  2 Feb   Winter  9480
##  3 Mar   Spring 10416
##  4 Apr   Spring 10080
##  5 May   Spring 10416
##  6 Jun   Summer  9720
##  7 Jul   Summer  9672
##  8 Aug   Summer  9672
##  9 Sep   Fall    9360
## 10 Oct   Fall    9672
## 11 Nov   Fall    9360
## 12 Dec   Winter  9672
# Read hourly JSON file (LAX) and store the processed result as laxTemp
# NOTE(review): addVars=TRUE appears to add the derived date/time and pct_* columns
# (tblHourly prints 37 columns, the glimpse that follows shows 80) - confirm in formatOpenMeteoJSON
laxTemp <- formatOpenMeteoJSON("testOM_hourly_lax.json", addVars=TRUE)
## 
## Objects in JSON include: latitude, longitude, generationtime_ms, utc_offset_seconds, timezone, timezone_abbreviation, elevation, hourly_units, hourly 
## 
## $tblDaily
## NULL
## 
## $tblHourly
## # A tibble: 122,712 × 37
##    time                date        hour temperature_2m relativehumidity_2m
##    <dttm>              <date>     <int>          <dbl>               <int>
##  1 2010-01-01 00:00:00 2010-01-01     0            6.3                  60
##  2 2010-01-01 01:00:00 2010-01-01     1            5.7                  62
##  3 2010-01-01 02:00:00 2010-01-01     2            5.3                  63
##  4 2010-01-01 03:00:00 2010-01-01     3            5                    64
##  5 2010-01-01 04:00:00 2010-01-01     4            4.8                  64
##  6 2010-01-01 05:00:00 2010-01-01     5            4.7                  64
##  7 2010-01-01 06:00:00 2010-01-01     6            4.7                  64
##  8 2010-01-01 07:00:00 2010-01-01     7            4.8                  64
##  9 2010-01-01 08:00:00 2010-01-01     8            5.2                  64
## 10 2010-01-01 09:00:00 2010-01-01     9            6.3                  63
## # ℹ 122,702 more rows
## # ℹ 32 more variables: dewpoint_2m <dbl>, apparent_temperature <dbl>,
## #   pressure_msl <dbl>, surface_pressure <dbl>, precipitation <dbl>,
## #   rain <dbl>, snowfall <dbl>, cloudcover <int>, cloudcover_low <int>,
## #   cloudcover_mid <int>, cloudcover_high <int>, shortwave_radiation <dbl>,
## #   direct_radiation <dbl>, direct_normal_irradiance <dbl>,
## #   diffuse_radiation <dbl>, windspeed_10m <dbl>, windspeed_100m <dbl>, …
## 
## $tblUnits
## # A tibble: 34 × 4
##    metricType   name                 value   description                        
##    <chr>        <chr>                <chr>   <chr>                              
##  1 hourly_units time                 iso8601 <NA>                               
##  2 hourly_units temperature_2m       deg C   Air temperature at 2 meters above …
##  3 hourly_units relativehumidity_2m  %       Relative humidity at 2 meters abov…
##  4 hourly_units dewpoint_2m          deg C   Dew point temperature at 2 meters …
##  5 hourly_units apparent_temperature deg C   Apparent temperature is the percei…
##  6 hourly_units pressure_msl         hPa     Atmospheric air pressure reduced t…
##  7 hourly_units surface_pressure     hPa     Atmospheric air pressure reduced t…
##  8 hourly_units precipitation        mm      Total precipitation (rain, showers…
##  9 hourly_units rain                 mm      Only liquid precipitation of the p…
## 10 hourly_units snowfall             cm      Snowfall amount of the preceding h…
## # ℹ 24 more rows
## 
## $tblDescription
## # A tibble: 1 × 7
##   latitude longitude generationtime_ms utc_offset_seconds timezone           
##      <dbl>     <dbl>             <dbl>              <int> <chr>              
## 1     34.1     -118.             6196.             -25200 America/Los_Angeles
## # ℹ 2 more variables: timezone_abbreviation <chr>, elevation <dbl>
## 
## 
## latitude: 34.13005
## longitude: -118.4981
## generationtime_ms: 6196.377
## utc_offset_seconds: -25200
## timezone: America/Los_Angeles
## timezone_abbreviation: PDT
## elevation: 333
## 
## Rows: 122,712
## Columns: 80
## $ time                              <dttm> 2010-01-01 00:00:00, 2010-01-01 01:…
## $ date                              <date> 2010-01-01, 2010-01-01, 2010-01-01,…
## $ hour                              <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11…
## $ temperature_2m                    <dbl> 6.3, 5.7, 5.3, 5.0, 4.8, 4.7, 4.7, 4…
## $ relativehumidity_2m               <int> 60, 62, 63, 64, 64, 64, 64, 64, 64, …
## $ dewpoint_2m                       <dbl> -0.9, -1.0, -1.2, -1.3, -1.4, -1.4, …
## $ apparent_temperature              <dbl> 2.9, 2.3, 1.8, 1.3, 1.0, 0.9, 0.9, 1…
## $ pressure_msl                      <dbl> 1026.5, 1026.1, 1025.7, 1025.7, 1024…
## $ surface_pressure                  <dbl> 985.7, 985.2, 984.8, 984.7, 983.9, 9…
## $ precipitation                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ rain                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ snowfall                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cloudcover                        <int> 14, 21, 23, 29, 31, 30, 29, 30, 31, …
## $ cloudcover_low                    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cloudcover_mid                    <int> 0, 0, 0, 0, 1, 0, 0, 0, 2, 3, 2, 6, …
## $ cloudcover_high                   <int> 48, 71, 78, 95, 100, 99, 98, 99, 100…
## $ shortwave_radiation               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 142, …
## $ direct_radiation                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 27, 16…
## $ direct_normal_irradiance          <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0…
## $ diffuse_radiation                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 31, 115, …
## $ windspeed_10m                     <dbl> 7.4, 7.8, 8.0, 9.7, 9.7, 10.1, 10.0,…
## $ windspeed_100m                    <dbl> 10.4, 10.6, 11.0, 14.9, 14.8, 14.6, …
## $ winddirection_10m                 <int> 14, 13, 10, 15, 15, 17, 15, 13, 13, …
## $ winddirection_100m                <int> 20, 24, 19, 20, 18, 20, 18, 18, 16, …
## $ windgusts_10m                     <dbl> 19.1, 19.1, 19.4, 19.8, 20.9, 21.6, …
## $ et0_fao_evapotranspiration        <dbl> 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, …
## $ weathercode                       <int> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ vapor_pressure_deficit            <dbl> 0.38, 0.35, 0.33, 0.31, 0.31, 0.31, …
## $ soil_temperature_0_to_7cm         <dbl> 7.0, 6.6, 6.2, 5.8, 5.6, 5.4, 5.3, 5…
## $ soil_temperature_7_to_28cm        <dbl> 10.8, 10.6, 10.3, 10.1, 9.9, 9.7, 9.…
## $ soil_temperature_28_to_100cm      <dbl> 12.9, 12.9, 12.9, 12.9, 12.9, 12.9, …
## $ soil_temperature_100_to_255cm     <dbl> 20.5, 20.5, 20.5, 20.5, 20.5, 20.5, …
## $ soil_moisture_0_to_7cm            <dbl> 0.205, 0.205, 0.205, 0.205, 0.205, 0…
## $ soil_moisture_7_to_28cm           <dbl> 0.251, 0.251, 0.251, 0.250, 0.250, 0…
## $ soil_moisture_28_to_100cm         <dbl> 0.168, 0.168, 0.168, 0.168, 0.168, 0…
## $ soil_moisture_100_to_255cm        <dbl> 0.165, 0.165, 0.165, 0.165, 0.165, 0…
## $ origTime                          <chr> "2010-01-01T00:00", "2010-01-01T01:0…
## $ year                              <dbl> 2010, 2010, 2010, 2010, 2010, 2010, …
## $ month                             <fct> Jan, Jan, Jan, Jan, Jan, Jan, Jan, J…
## $ fct_hour                          <fct> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11…
## $ tod                               <fct> Night, Night, Night, Night, Night, N…
## $ doy                               <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ season                            <fct> Winter, Winter, Winter, Winter, Wint…
## $ todSeason                         <fct> Winter-Night, Winter-Night, Winter-N…
## $ pct_hour                          <dbl> 0, 4, 8, 13, 17, 21, 25, 29, 33, 38,…
## $ pct_temperature_2m                <dbl> 4, 3, 3, 2, 2, 2, 2, 2, 3, 4, 12, 34…
## $ pct_relativehumidity_2m           <dbl> 52, 54, 55, 57, 57, 57, 57, 57, 57, …
## $ pct_dewpoint_2m                   <dbl> 15, 15, 15, 14, 14, 14, 14, 14, 15, …
## $ pct_apparent_temperature          <dbl> 4, 3, 3, 2, 2, 2, 2, 2, 2, 4, 10, 28…
## $ pct_pressure_msl                  <dbl> 100, 100, 99, 99, 99, 99, 98, 98, 98…
## $ pct_surface_pressure              <dbl> 99, 99, 99, 99, 98, 98, 97, 97, 97, …
## $ pct_precipitation                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_rain                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_snowfall                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_cloudcover                    <dbl> 58, 63, 65, 71, 75, 73, 71, 73, 75, …
## $ pct_cloudcover_low                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_cloudcover_mid                <dbl> 0, 0, 0, 0, 76, 0, 0, 0, 78, 80, 78,…
## $ pct_cloudcover_high               <dbl> 80, 84, 85, 91, 96, 95, 94, 95, 96, …
## $ pct_shortwave_radiation           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 52, 59, 6…
## $ pct_direct_radiation              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 49, 55, 6…
## $ pct_direct_normal_irradiance      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 49, 54, 6…
## $ pct_diffuse_radiation             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 86, 9…
## $ pct_windspeed_10m                 <dbl> 61, 64, 65, 77, 77, 79, 79, 79, 79, …
## $ pct_windspeed_100m                <dbl> 60, 61, 63, 81, 80, 80, 79, 79, 78, …
## $ pct_winddirection_10m             <dbl> 6, 5, 3, 7, 7, 8, 7, 5, 5, 5, 7, 9, …
## $ pct_winddirection_100m            <dbl> 8, 10, 8, 8, 7, 8, 7, 7, 6, 4, 4, 4,…
## $ pct_windgusts_10m                 <dbl> 51, 51, 52, 53, 56, 58, 58, 59, 58, …
## $ pct_et0_fao_evapotranspiration    <dbl> 34, 34, 34, 34, 34, 34, 34, 34, 34, …
## $ pct_weathercode                   <dbl> 0, 63, 63, 63, 63, 63, 63, 63, 63, 6…
## $ pct_vapor_pressure_deficit        <dbl> 31, 29, 28, 27, 27, 27, 26, 27, 27, …
## $ pct_soil_temperature_0_to_7cm     <dbl> 3, 3, 2, 2, 2, 1, 1, 1, 1, 2, 5, 15,…
## $ pct_soil_temperature_7_to_28cm    <dbl> 6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 2, …
## $ pct_soil_temperature_28_to_100cm  <dbl> 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, …
## $ pct_soil_temperature_100_to_255cm <dbl> 64, 64, 64, 64, 64, 64, 64, 64, 64, …
## $ pct_soil_moisture_0_to_7cm        <dbl> 83, 83, 83, 83, 83, 83, 83, 83, 83, …
## $ pct_soil_moisture_7_to_28cm       <dbl> 87, 87, 87, 87, 87, 87, 87, 87, 87, …
## $ pct_soil_moisture_28_to_100cm     <dbl> 56, 56, 56, 56, 56, 56, 56, 56, 56, …
## $ pct_soil_moisture_100_to_255cm    <dbl> 34, 34, 34, 34, 34, 34, 34, 34, 34, …
## $ pct_year                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_doy                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

## # A tibble: 8 × 4
##   todSeason    season tod       n
##   <fct>        <fct>  <fct> <int>
## 1 Spring-Day   Spring Day   15456
## 2 Spring-Night Spring Night 15456
## 3 Summer-Day   Summer Day   15456
## 4 Summer-Night Summer Night 15456
## 5 Fall-Day     Fall   Day   15288
## 6 Fall-Night   Fall   Night 15288
## 7 Winter-Day   Winter Day   15156
## 8 Winter-Night Winter Night 15156
## # A tibble: 24 × 4
##     hour fct_hour tod       n
##    <int> <fct>    <fct> <int>
##  1     0 0        Night  5113
##  2     1 1        Night  5113
##  3     2 2        Night  5113
##  4     3 3        Night  5113
##  5     4 4        Night  5113
##  6     5 5        Night  5113
##  7     6 6        Night  5113
##  8     7 7        Day    5113
##  9     8 8        Day    5113
## 10     9 9        Day    5113
## 11    10 10       Day    5113
## 12    11 11       Day    5113
## 13    12 12       Day    5113
## 14    13 13       Day    5113
## 15    14 14       Day    5113
## 16    15 15       Day    5113
## 17    16 16       Day    5113
## 18    17 17       Day    5113
## 19    18 18       Day    5113
## 20    19 19       Night  5113
## 21    20 20       Night  5113
## 22    21 21       Night  5113
## 23    22 22       Night  5113
## 24    23 23       Night  5113
## # A tibble: 12 × 3
##    month season     n
##    <fct> <fct>  <int>
##  1 Jan   Winter 10416
##  2 Feb   Winter  9480
##  3 Mar   Spring 10416
##  4 Apr   Spring 10080
##  5 May   Spring 10416
##  6 Jun   Summer 10080
##  7 Jul   Summer 10416
##  8 Aug   Summer 10416
##  9 Sep   Fall   10080
## 10 Oct   Fall   10416
## 11 Nov   Fall   10080
## 12 Dec   Winter 10416

Processed hourly data for Chicago and Houston are loaded:

# Read hourly JSON file (CHI and HOU)
# Chicago is read first and stored as chiTemp
# NOTE(review): addVars=TRUE appears to add the derived date/time and pct_* columns
# (tblHourly prints 37 columns, the glimpse that follows shows 80) - confirm in formatOpenMeteoJSON
chiTemp <- formatOpenMeteoJSON("testOM_hourly_chi.json", addVars=TRUE)
## 
## Objects in JSON include: latitude, longitude, generationtime_ms, utc_offset_seconds, timezone, timezone_abbreviation, elevation, hourly_units, hourly 
## 
## $tblDaily
## NULL
## 
## $tblHourly
## # A tibble: 122,712 × 37
##    time                date        hour temperature_2m relativehumidity_2m
##    <dttm>              <date>     <int>          <dbl>               <int>
##  1 2010-01-01 00:00:00 2010-01-01     0           -9.5                  67
##  2 2010-01-01 01:00:00 2010-01-01     1           -9.8                  69
##  3 2010-01-01 02:00:00 2010-01-01     2          -10.3                  73
##  4 2010-01-01 03:00:00 2010-01-01     3          -10.8                  74
##  5 2010-01-01 04:00:00 2010-01-01     4          -11.3                  75
##  6 2010-01-01 05:00:00 2010-01-01     5          -11.8                  76
##  7 2010-01-01 06:00:00 2010-01-01     6          -12.3                  77
##  8 2010-01-01 07:00:00 2010-01-01     7          -12.8                  78
##  9 2010-01-01 08:00:00 2010-01-01     8          -13.2                  79
## 10 2010-01-01 09:00:00 2010-01-01     9          -13.4                  78
## # ℹ 122,702 more rows
## # ℹ 32 more variables: dewpoint_2m <dbl>, apparent_temperature <dbl>,
## #   pressure_msl <dbl>, surface_pressure <dbl>, precipitation <dbl>,
## #   rain <dbl>, snowfall <dbl>, cloudcover <int>, cloudcover_low <int>,
## #   cloudcover_mid <int>, cloudcover_high <int>, shortwave_radiation <dbl>,
## #   direct_radiation <dbl>, direct_normal_irradiance <dbl>,
## #   diffuse_radiation <dbl>, windspeed_10m <dbl>, windspeed_100m <dbl>, …
## 
## $tblUnits
## # A tibble: 34 × 4
##    metricType   name                 value   description                        
##    <chr>        <chr>                <chr>   <chr>                              
##  1 hourly_units time                 iso8601 <NA>                               
##  2 hourly_units temperature_2m       deg C   Air temperature at 2 meters above …
##  3 hourly_units relativehumidity_2m  %       Relative humidity at 2 meters abov…
##  4 hourly_units dewpoint_2m          deg C   Dew point temperature at 2 meters …
##  5 hourly_units apparent_temperature deg C   Apparent temperature is the percei…
##  6 hourly_units pressure_msl         hPa     Atmospheric air pressure reduced t…
##  7 hourly_units surface_pressure     hPa     Atmospheric air pressure reduced t…
##  8 hourly_units precipitation        mm      Total precipitation (rain, showers…
##  9 hourly_units rain                 mm      Only liquid precipitation of the p…
## 10 hourly_units snowfall             cm      Snowfall amount of the preceding h…
## # ℹ 24 more rows
## 
## $tblDescription
## # A tibble: 1 × 7
##   latitude longitude generationtime_ms utc_offset_seconds timezone       
##      <dbl>     <dbl>             <dbl>              <int> <chr>          
## 1     41.9     -87.6             4476.             -18000 America/Chicago
## # ℹ 2 more variables: timezone_abbreviation <chr>, elevation <dbl>
## 
## 
## latitude: 41.86292
## longitude: -87.64877
## generationtime_ms: 4476.2
## utc_offset_seconds: -18000
## timezone: America/Chicago
## timezone_abbreviation: CDT
## elevation: 180
## 
## Rows: 122,712
## Columns: 80
## $ time                              <dttm> 2010-01-01 00:00:00, 2010-01-01 01:…
## $ date                              <date> 2010-01-01, 2010-01-01, 2010-01-01,…
## $ hour                              <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11…
## $ temperature_2m                    <dbl> -9.5, -9.8, -10.3, -10.8, -11.3, -11…
## $ relativehumidity_2m               <int> 67, 69, 73, 74, 75, 76, 77, 78, 79, …
## $ dewpoint_2m                       <dbl> -14.4, -14.4, -14.2, -14.5, -14.8, -…
## $ apparent_temperature              <dbl> -15.8, -16.3, -16.8, -17.2, -17.7, -…
## $ pressure_msl                      <dbl> 1024.4, 1024.7, 1025.3, 1025.8, 1026…
## $ surface_pressure                  <dbl> 1000.8, 1001.1, 1001.6, 1002.1, 1002…
## $ precipitation                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ rain                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ snowfall                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cloudcover                        <int> 62, 47, 20, 15, 15, 19, 25, 22, 22, …
## $ cloudcover_low                    <int> 69, 52, 22, 17, 17, 21, 28, 25, 25, …
## $ cloudcover_mid                    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, …
## $ cloudcover_high                   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ shortwave_radiation               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 119, …
## $ direct_radiation                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 69, 14…
## $ direct_normal_irradiance          <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0…
## $ diffuse_radiation                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 50, 7…
## $ windspeed_10m                     <dbl> 18.7, 20.1, 19.9, 19.5, 19.0, 19.4, …
## $ windspeed_100m                    <dbl> 25.9, 28.4, 29.2, 29.8, 30.1, 30.0, …
## $ winddirection_10m                 <int> 298, 291, 290, 289, 289, 288, 287, 2…
## $ winddirection_100m                <int> 299, 294, 294, 295, 295, 294, 295, 2…
## $ windgusts_10m                     <dbl> 33.8, 32.4, 34.2, 33.1, 31.3, 31.7, …
## $ et0_fao_evapotranspiration        <dbl> 0.02, 0.01, 0.01, 0.01, 0.01, 0.01, …
## $ weathercode                       <int> 2, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, …
## $ vapor_pressure_deficit            <dbl> 0.10, 0.09, 0.08, 0.07, 0.06, 0.06, …
## $ soil_temperature_0_to_7cm         <dbl> -1.5, -1.6, -1.8, -1.9, -2.1, -2.3, …
## $ soil_temperature_7_to_28cm        <dbl> -0.4, -0.4, -0.4, -0.4, -0.4, -0.4, …
## $ soil_temperature_28_to_100cm      <dbl> 2.4, 2.4, 2.4, 2.4, 2.3, 2.3, 2.3, 2…
## $ soil_temperature_100_to_255cm     <dbl> 9.0, 9.0, 9.0, 9.0, 8.9, 8.9, 8.9, 8…
## $ soil_moisture_0_to_7cm            <dbl> 0.295, 0.295, 0.294, 0.294, 0.294, 0…
## $ soil_moisture_7_to_28cm           <dbl> 0.300, 0.300, 0.300, 0.300, 0.300, 0…
## $ soil_moisture_28_to_100cm         <dbl> 0.334, 0.334, 0.334, 0.334, 0.334, 0…
## $ soil_moisture_100_to_255cm        <dbl> 0.310, 0.310, 0.310, 0.310, 0.311, 0…
## $ origTime                          <chr> "2010-01-01T00:00", "2010-01-01T01:0…
## $ year                              <dbl> 2010, 2010, 2010, 2010, 2010, 2010, …
## $ month                             <fct> Jan, Jan, Jan, Jan, Jan, Jan, Jan, J…
## $ fct_hour                          <fct> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11…
## $ tod                               <fct> Night, Night, Night, Night, Night, N…
## $ doy                               <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ season                            <fct> Winter, Winter, Winter, Winter, Wint…
## $ todSeason                         <fct> Winter-Night, Winter-Night, Winter-N…
## $ pct_hour                          <dbl> 0, 4, 8, 13, 17, 21, 25, 29, 33, 38,…
## $ pct_temperature_2m                <dbl> 4, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, …
## $ pct_relativehumidity_2m           <dbl> 33, 37, 46, 48, 50, 52, 55, 57, 59, …
## $ pct_dewpoint_2m                   <dbl> 4, 4, 5, 4, 4, 4, 4, 4, 3, 3, 3, 4, …
## $ pct_apparent_temperature          <dbl> 4, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, …
## $ pct_pressure_msl                  <dbl> 84, 85, 86, 88, 89, 89, 90, 91, 91, …
## $ pct_surface_pressure              <dbl> 80, 81, 83, 85, 85, 86, 87, 89, 89, …
## $ pct_precipitation                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_rain                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_snowfall                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_cloudcover                    <dbl> 62, 55, 33, 30, 30, 33, 37, 35, 35, …
## $ pct_cloudcover_low                <dbl> 77, 74, 66, 64, 64, 66, 68, 67, 67, …
## $ pct_cloudcover_mid                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 45, 0,…
## $ pct_cloudcover_high               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_shortwave_radiation           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 52, 63, 7…
## $ pct_direct_radiation              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 57, 69, 7…
## $ pct_direct_normal_irradiance      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 62, 76, 8…
## $ pct_diffuse_radiation             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 51, 59, 6…
## $ pct_windspeed_10m                 <dbl> 66, 72, 71, 70, 68, 69, 65, 63, 59, …
## $ pct_windspeed_100m                <dbl> 59, 67, 69, 71, 72, 72, 67, 63, 61, …
## $ pct_winddirection_10m             <dbl> 87, 85, 84, 84, 84, 84, 83, 83, 83, …
## $ pct_winddirection_100m            <dbl> 86, 85, 85, 85, 85, 85, 85, 85, 84, …
## $ pct_windgusts_10m                 <dbl> 69, 65, 70, 67, 62, 63, 63, 61, 59, …
## $ pct_et0_fao_evapotranspiration    <dbl> 27, 16, 16, 16, 16, 16, 16, 16, 16, …
## $ pct_weathercode                   <dbl> 55, 34, 0, 0, 0, 0, 34, 34, 34, 0, 3…
## $ pct_vapor_pressure_deficit        <dbl> 17, 15, 12, 10, 7, 7, 5, 5, 5, 5, 5,…
## $ pct_soil_temperature_0_to_7cm     <dbl> 9, 8, 7, 6, 6, 5, 4, 3, 3, 2, 2, 2, …
## $ pct_soil_temperature_7_to_28cm    <dbl> 11, 11, 11, 11, 11, 11, 11, 11, 11, …
## $ pct_soil_temperature_28_to_100cm  <dbl> 18, 18, 18, 18, 18, 18, 18, 18, 18, …
## $ pct_soil_temperature_100_to_255cm <dbl> 40, 40, 40, 40, 40, 40, 40, 40, 40, …
## $ pct_soil_moisture_0_to_7cm        <dbl> 80, 80, 80, 80, 80, 80, 80, 80, 80, …
## $ pct_soil_moisture_7_to_28cm       <dbl> 84, 84, 84, 84, 84, 84, 84, 84, 84, …
## $ pct_soil_moisture_28_to_100cm     <dbl> 99, 99, 99, 99, 99, 99, 99, 98, 98, …
## $ pct_soil_moisture_100_to_255cm    <dbl> 85, 85, 85, 85, 86, 86, 86, 86, 86, …
## $ pct_year                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_doy                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

## # A tibble: 8 × 4
##   todSeason    season tod       n
##   <fct>        <fct>  <fct> <int>
## 1 Spring-Day   Spring Day   15456
## 2 Spring-Night Spring Night 15456
## 3 Summer-Day   Summer Day   15456
## 4 Summer-Night Summer Night 15456
## 5 Fall-Day     Fall   Day   15288
## 6 Fall-Night   Fall   Night 15288
## 7 Winter-Day   Winter Day   15156
## 8 Winter-Night Winter Night 15156
## # A tibble: 24 × 4
##     hour fct_hour tod       n
##    <int> <fct>    <fct> <int>
##  1     0 0        Night  5113
##  2     1 1        Night  5113
##  3     2 2        Night  5113
##  4     3 3        Night  5113
##  5     4 4        Night  5113
##  6     5 5        Night  5113
##  7     6 6        Night  5113
##  8     7 7        Day    5113
##  9     8 8        Day    5113
## 10     9 9        Day    5113
## 11    10 10       Day    5113
## 12    11 11       Day    5113
## 13    12 12       Day    5113
## 14    13 13       Day    5113
## 15    14 14       Day    5113
## 16    15 15       Day    5113
## 17    16 16       Day    5113
## 18    17 17       Day    5113
## 19    18 18       Day    5113
## 20    19 19       Night  5113
## 21    20 20       Night  5113
## 22    21 21       Night  5113
## 23    22 22       Night  5113
## 24    23 23       Night  5113
## # A tibble: 12 × 3
##    month season     n
##    <fct> <fct>  <int>
##  1 Jan   Winter 10416
##  2 Feb   Winter  9480
##  3 Mar   Spring 10416
##  4 Apr   Spring 10080
##  5 May   Spring 10416
##  6 Jun   Summer 10080
##  7 Jul   Summer 10416
##  8 Aug   Summer 10416
##  9 Sep   Fall   10080
## 10 Oct   Fall   10416
## 11 Nov   Fall   10080
## 12 Dec   Winter 10416
# Read hourly JSON file (HOU) and store the processed result as houTemp
# NOTE(review): addVars=TRUE appears to add the derived date/time and pct_* columns
# (tblHourly prints 37 columns, the glimpse that follows shows 80) - confirm in formatOpenMeteoJSON
houTemp <- formatOpenMeteoJSON("testOM_hourly_hou.json", addVars=TRUE)
## 
## Objects in JSON include: latitude, longitude, generationtime_ms, utc_offset_seconds, timezone, timezone_abbreviation, elevation, hourly_units, hourly 
## 
## $tblDaily
## NULL
## 
## $tblHourly
## # A tibble: 122,712 × 37
##    time                date        hour temperature_2m relativehumidity_2m
##    <dttm>              <date>     <int>          <dbl>               <int>
##  1 2010-01-01 00:00:00 2010-01-01     0           10.9                  93
##  2 2010-01-01 01:00:00 2010-01-01     1            9.9                  92
##  3 2010-01-01 02:00:00 2010-01-01     2            8.6                  88
##  4 2010-01-01 03:00:00 2010-01-01     3            7.7                  86
##  5 2010-01-01 04:00:00 2010-01-01     4            7.2                  85
##  6 2010-01-01 05:00:00 2010-01-01     5            6.8                  84
##  7 2010-01-01 06:00:00 2010-01-01     6            6.4                  82
##  8 2010-01-01 07:00:00 2010-01-01     7            5.9                  83
##  9 2010-01-01 08:00:00 2010-01-01     8            5.6                  83
## 10 2010-01-01 09:00:00 2010-01-01     9            5.5                  82
## # ℹ 122,702 more rows
## # ℹ 32 more variables: dewpoint_2m <dbl>, apparent_temperature <dbl>,
## #   pressure_msl <dbl>, surface_pressure <dbl>, precipitation <dbl>,
## #   rain <dbl>, snowfall <dbl>, cloudcover <int>, cloudcover_low <int>,
## #   cloudcover_mid <int>, cloudcover_high <int>, shortwave_radiation <dbl>,
## #   direct_radiation <dbl>, direct_normal_irradiance <dbl>,
## #   diffuse_radiation <dbl>, windspeed_10m <dbl>, windspeed_100m <dbl>, …
## 
## $tblUnits
## # A tibble: 34 × 4
##    metricType   name                 value   description                        
##    <chr>        <chr>                <chr>   <chr>                              
##  1 hourly_units time                 iso8601 <NA>                               
##  2 hourly_units temperature_2m       deg C   Air temperature at 2 meters above …
##  3 hourly_units relativehumidity_2m  %       Relative humidity at 2 meters abov…
##  4 hourly_units dewpoint_2m          deg C   Dew point temperature at 2 meters …
##  5 hourly_units apparent_temperature deg C   Apparent temperature is the percei…
##  6 hourly_units pressure_msl         hPa     Atmospheric air pressure reduced t…
##  7 hourly_units surface_pressure     hPa     Atmospheric air pressure reduced t…
##  8 hourly_units precipitation        mm      Total precipitation (rain, showers…
##  9 hourly_units rain                 mm      Only liquid precipitation of the p…
## 10 hourly_units snowfall             cm      Snowfall amount of the preceding h…
## # ℹ 24 more rows
## 
## $tblDescription
## # A tibble: 1 × 7
##   latitude longitude generationtime_ms utc_offset_seconds timezone  
##      <dbl>     <dbl>             <dbl>              <int> <chr>     
## 1     29.8     -95.4             3762.             -18000 US/Central
## # ℹ 2 more variables: timezone_abbreviation <chr>, elevation <dbl>
## 
## 
## latitude: 29.77153
## longitude: -95.43555
## generationtime_ms: 3762.283
## utc_offset_seconds: -18000
## timezone: US/Central
## timezone_abbreviation: CDT
## elevation: 17
## 
## Rows: 122,712
## Columns: 80
## $ time                              <dttm> 2010-01-01 00:00:00, 2010-01-01 01:…
## $ date                              <date> 2010-01-01, 2010-01-01, 2010-01-01,…
## $ hour                              <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11…
## $ temperature_2m                    <dbl> 10.9, 9.9, 8.6, 7.7, 7.2, 6.8, 6.4, …
## $ relativehumidity_2m               <int> 93, 92, 88, 86, 85, 84, 82, 83, 83, …
## $ dewpoint_2m                       <dbl> 9.8, 8.6, 6.7, 5.6, 4.8, 4.2, 3.6, 3…
## $ apparent_temperature              <dbl> 7.4, 5.7, 4.1, 3.2, 2.9, 2.4, 2.2, 1…
## $ pressure_msl                      <dbl> 1025.2, 1025.9, 1026.8, 1027.1, 1027…
## $ surface_pressure                  <dbl> 1023.1, 1023.8, 1024.7, 1025.0, 1025…
## $ precipitation                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ rain                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ snowfall                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cloudcover                        <int> 90, 90, 88, 88, 89, 89, 86, 80, 90, …
## $ cloudcover_low                    <int> 100, 100, 98, 98, 99, 99, 96, 89, 10…
## $ cloudcover_mid                    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cloudcover_high                   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ shortwave_radiation               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 89, 1…
## $ direct_radiation                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 28, 58…
## $ direct_normal_irradiance          <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0…
## $ diffuse_radiation                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 61, 1…
## $ windspeed_10m                     <dbl> 24.0, 25.9, 25.3, 23.5, 20.9, 20.7, …
## $ windspeed_100m                    <dbl> 37.4, 39.1, 38.4, 35.4, 32.0, 31.2, …
## $ winddirection_10m                 <int> 330, 333, 336, 339, 341, 340, 347, 3…
## $ winddirection_100m                <int> 332, 334, 337, 341, 343, 341, 347, 3…
## $ windgusts_10m                     <dbl> 44.3, 46.1, 46.8, 44.3, 41.0, 37.8, …
## $ et0_fao_evapotranspiration        <dbl> 0.00, 0.01, 0.01, 0.01, 0.02, 0.02, …
## $ weathercode                       <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, …
## $ vapor_pressure_deficit            <dbl> 0.10, 0.10, 0.14, 0.14, 0.16, 0.16, …
## $ soil_temperature_0_to_7cm         <dbl> 11.9, 11.5, 11.0, 10.5, 10.1, 9.8, 9…
## $ soil_temperature_7_to_28cm        <dbl> 12.3, 12.3, 12.2, 12.2, 12.1, 12.0, …
## $ soil_temperature_28_to_100cm      <dbl> 14.2, 14.2, 14.2, 14.2, 14.2, 14.2, …
## $ soil_temperature_100_to_255cm     <dbl> 20.9, 20.9, 20.9, 20.9, 20.9, 20.9, …
## $ soil_moisture_0_to_7cm            <dbl> 0.462, 0.462, 0.462, 0.462, 0.462, 0…
## $ soil_moisture_7_to_28cm           <dbl> 0.474, 0.474, 0.474, 0.474, 0.473, 0…
## $ soil_moisture_28_to_100cm         <dbl> 0.498, 0.498, 0.498, 0.498, 0.498, 0…
## $ soil_moisture_100_to_255cm        <dbl> 0.453, 0.453, 0.453, 0.453, 0.453, 0…
## $ origTime                          <chr> "2010-01-01T00:00", "2010-01-01T01:0…
## $ year                              <dbl> 2010, 2010, 2010, 2010, 2010, 2010, …
## $ month                             <fct> Jan, Jan, Jan, Jan, Jan, Jan, Jan, J…
## $ fct_hour                          <fct> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11…
## $ tod                               <fct> Night, Night, Night, Night, Night, N…
## $ doy                               <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ season                            <fct> Winter, Winter, Winter, Winter, Wint…
## $ todSeason                         <fct> Winter-Night, Winter-Night, Winter-N…
## $ pct_hour                          <dbl> 0, 4, 8, 13, 17, 21, 25, 29, 33, 38,…
## $ pct_temperature_2m                <dbl> 12, 10, 8, 6, 6, 5, 5, 4, 4, 4, 4, 5…
## $ pct_relativehumidity_2m           <dbl> 80, 77, 67, 63, 61, 59, 55, 57, 57, …
## $ pct_dewpoint_2m                   <dbl> 23, 21, 17, 15, 13, 12, 11, 10, 9, 9…
## $ pct_apparent_temperature          <dbl> 11, 9, 6, 5, 5, 4, 4, 4, 4, 3, 3, 4,…
## $ pct_pressure_msl                  <dbl> 92, 93, 94, 95, 96, 97, 97, 97, 97, …
## $ pct_surface_pressure              <dbl> 92, 93, 94, 95, 96, 97, 97, 97, 98, …
## $ pct_precipitation                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_rain                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_snowfall                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_cloudcover                    <dbl> 80, 80, 79, 79, 79, 79, 78, 76, 80, …
## $ pct_cloudcover_low                <dbl> 89, 89, 87, 87, 88, 88, 86, 84, 89, …
## $ pct_cloudcover_mid                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_cloudcover_high               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_shortwave_radiation           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 51, 59, 6…
## $ pct_direct_radiation              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 52, 61, 6…
## $ pct_direct_normal_irradiance      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 54, 63, 6…
## $ pct_diffuse_radiation             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 52, 60, 7…
## $ pct_windspeed_10m                 <dbl> 95, 97, 96, 94, 90, 89, 83, 79, 78, …
## $ pct_windspeed_100m                <dbl> 96, 97, 97, 95, 90, 89, 82, 78, 76, …
## $ pct_winddirection_10m             <dbl> 91, 92, 92, 93, 93, 93, 95, 98, 96, …
## $ pct_winddirection_100m            <dbl> 92, 92, 93, 94, 94, 94, 96, 99, 97, …
## $ pct_windgusts_10m                 <dbl> 94, 96, 96, 94, 91, 87, 87, 84, 77, …
## $ pct_et0_fao_evapotranspiration    <dbl> 0, 24, 24, 24, 32, 32, 32, 24, 24, 3…
## $ pct_weathercode                   <dbl> 69, 69, 69, 69, 69, 69, 69, 69, 69, …
## $ pct_vapor_pressure_deficit        <dbl> 10, 10, 16, 16, 19, 19, 20, 19, 19, …
## $ pct_soil_temperature_0_to_7cm     <dbl> 10, 9, 8, 7, 6, 6, 5, 4, 4, 4, 4, 5,…
## $ pct_soil_temperature_7_to_28cm    <dbl> 6, 6, 6, 6, 6, 6, 5, 5, 5, 4, 4, 4, …
## $ pct_soil_temperature_28_to_100cm  <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ pct_soil_temperature_100_to_255cm <dbl> 38, 38, 38, 38, 38, 38, 38, 38, 38, …
## $ pct_soil_moisture_0_to_7cm        <dbl> 82, 82, 82, 82, 82, 82, 82, 82, 82, …
## $ pct_soil_moisture_7_to_28cm       <dbl> 88, 88, 88, 88, 88, 88, 88, 88, 88, …
## $ pct_soil_moisture_28_to_100cm     <dbl> 98, 98, 98, 98, 98, 98, 98, 98, 98, …
## $ pct_soil_moisture_100_to_255cm    <dbl> 82, 82, 82, 82, 82, 82, 82, 82, 82, …
## $ pct_year                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_doy                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

## # A tibble: 8 × 4
##   todSeason    season tod       n
##   <fct>        <fct>  <fct> <int>
## 1 Spring-Day   Spring Day   15456
## 2 Spring-Night Spring Night 15456
## 3 Summer-Day   Summer Day   15456
## 4 Summer-Night Summer Night 15456
## 5 Fall-Day     Fall   Day   15288
## 6 Fall-Night   Fall   Night 15288
## 7 Winter-Day   Winter Day   15156
## 8 Winter-Night Winter Night 15156
## # A tibble: 24 × 4
##     hour fct_hour tod       n
##    <int> <fct>    <fct> <int>
##  1     0 0        Night  5113
##  2     1 1        Night  5113
##  3     2 2        Night  5113
##  4     3 3        Night  5113
##  5     4 4        Night  5113
##  6     5 5        Night  5113
##  7     6 6        Night  5113
##  8     7 7        Day    5113
##  9     8 8        Day    5113
## 10     9 9        Day    5113
## 11    10 10       Day    5113
## 12    11 11       Day    5113
## 13    12 12       Day    5113
## 14    13 13       Day    5113
## 15    14 14       Day    5113
## 16    15 15       Day    5113
## 17    16 16       Day    5113
## 18    17 17       Day    5113
## 19    18 18       Day    5113
## 20    19 19       Night  5113
## 21    20 20       Night  5113
## 22    21 21       Night  5113
## 23    22 22       Night  5113
## 24    23 23       Night  5113
## # A tibble: 12 × 3
##    month season     n
##    <fct> <fct>  <int>
##  1 Jan   Winter 10416
##  2 Feb   Winter  9480
##  3 Mar   Spring 10416
##  4 Apr   Spring 10080
##  5 May   Spring 10416
##  6 Jun   Summer 10080
##  7 Jul   Summer 10416
##  8 Aug   Summer 10416
##  9 Sep   Fall   10080
## 10 Oct   Fall   10416
## 11 Nov   Fall   10080
## 12 Dec   Winter 10416

An integrated set of all-city test and train data is created:

# Stack the per-city data frames into a single tibble
# .id="src" stores each list name (the city) in a new leading column
allCity <- list("NYC"=nycTemp, 
                "LA"=laxTemp, 
                "Chicago"=chiTemp, 
                "Houston"=houTemp
                ) %>%
    bind_rows(.id="src")

# Create the index for training data (reproducible 70% sample of row numbers, without replacement)
# sample.int(n, ...) gives the same draw as sample(1:n, ...) for the same seed, 
# but stays correct when n is 0 (where 1:n would be c(1, 0))
set.seed(24061512)
idxTrain <- sample.int(nrow(allCity), size=round(0.7*nrow(allCity)), replace=FALSE)

# Add test-train flag to full dataset, plus a factor copy of the city for use as a classification target
allCity <- allCity %>%
    mutate(tt=ifelse(row_number() %in% idxTrain, "train", "test"), 
           fct_src=factor(src))
allCity
## # A tibble: 486,072 × 83
##    src   time                date        hour temperature_2m relativehumidity_2m
##    <chr> <dttm>              <date>     <int>          <dbl>               <int>
##  1 NYC   2010-01-01 00:00:00 2010-01-01     0           -1.1                  95
##  2 NYC   2010-01-01 01:00:00 2010-01-01     1           -1                    96
##  3 NYC   2010-01-01 02:00:00 2010-01-01     2           -1                    96
##  4 NYC   2010-01-01 03:00:00 2010-01-01     3           -0.8                  97
##  5 NYC   2010-01-01 04:00:00 2010-01-01     4           -0.9                  97
##  6 NYC   2010-01-01 05:00:00 2010-01-01     5           -0.8                  97
##  7 NYC   2010-01-01 06:00:00 2010-01-01     6           -0.7                  97
##  8 NYC   2010-01-01 07:00:00 2010-01-01     7           -0.5                  97
##  9 NYC   2010-01-01 08:00:00 2010-01-01     8           -0.6                  97
## 10 NYC   2010-01-01 09:00:00 2010-01-01     9           -0.6                  97
## # ℹ 486,062 more rows
## # ℹ 77 more variables: dewpoint_2m <dbl>, apparent_temperature <dbl>,
## #   pressure_msl <dbl>, surface_pressure <dbl>, precipitation <dbl>,
## #   rain <dbl>, snowfall <dbl>, cloudcover <int>, cloudcover_low <int>,
## #   cloudcover_mid <int>, cloudcover_high <int>, shortwave_radiation <dbl>,
## #   direct_radiation <dbl>, direct_normal_irradiance <dbl>,
## #   diffuse_radiation <dbl>, windspeed_10m <dbl>, windspeed_100m <dbl>, …
# Tabulate rows by city and test/train flag, spread into one column per year
allCity %>% 
    group_by(year, src, tt) %>% 
    summarize(n=n(), .groups="drop") %>% 
    pivot_wider(id_cols=c("src", "tt"), names_from="year", values_from="n")
## # A tibble: 8 × 16
##   src     tt    `2010` `2011` `2012` `2013` `2014` `2015` `2016` `2017` `2018`
##   <chr>   <chr>  <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>
## 1 Chicago test    2555   2660   2671   2667   2612   2648   2550   2567   2648
## 2 Chicago train   6205   6100   6113   6093   6148   6112   6234   6193   6112
## 3 Houston test    2666   2562   2671   2621   2695   2639   2595   2688   2631
## 4 Houston train   6094   6198   6113   6139   6065   6121   6189   6072   6129
## 5 LA      test    2638   2653   2679   2591   2645   2634   2648   2579   2729
## 6 LA      train   6122   6107   6105   6169   6115   6126   6136   6181   6031
## 7 NYC     test    2644   2648   2579   2627   2645   2577   2603   2589   2618
## 8 NYC     train   6116   6112   6205   6133   6115   6183   6181   6171   6142
## # ℹ 5 more variables: `2019` <int>, `2020` <int>, `2021` <int>, `2022` <int>,
## #   `2023` <int>

Distributions of several key variables are explored:

# Metrics whose distributions are compared across cities
keyVars <- c('temperature_2m', 'relativehumidity_2m', 'dewpoint_2m', 
             'shortwave_radiation', 'vapor_pressure_deficit', 
             'soil_temperature_28_to_100cm', 'soil_temperature_100_to_255cm', 
             'soil_moisture_28_to_100cm', 'soil_moisture_100_to_255cm'
             )

# One boxplot facet per metric (free y-axis since the metrics are on different scales), one box per city
allCity %>%
    colSelector(vecSelect=c("src", keyVars)) %>%
    pivot_longer(cols=-c(src)) %>%
    ggplot(aes(x=src, y=value, fill=src)) + 
    geom_boxplot() + 
    scale_fill_discrete(name=NULL) + 
    labs(title="Distribution of Key Metrics by City", x=NULL, y=NULL) + 
    facet_wrap(~name, scales="free_y")

In addition, pair plots by city are created for several combinations of variables:

# Variables to be plotted against each other, one plot per unordered pair
keyVars <- c('pressure_msl', 
             'surface_pressure', 
             'soil_temperature_100_to_255cm', 
             'soil_moisture_100_to_255cm'
             )

# Bucket the variables once, outside the loops, since the rounding does not depend on the pair being plotted
# (nearest 0.5 for the pressures and soil temperature, nearest 0.01 for soil moisture)
# Bucketing lets identical points be counted and drawn as a single sized point
tmpPairData <- allCity %>%
    mutate(across(all_of(c("pressure_msl", "surface_pressure", "soil_temperature_100_to_255cm")), 
                  .fns=function(x) round(x*2)/2
                  ), 
           soil_moisture_100_to_255cm=round(soil_moisture_100_to_255cm, 2)
           )

for(intCtr in seq_len(length(keyVars)-1)) {
    for(intCtr2 in (intCtr+1):length(keyVars)) {
        p1 <- tmpPairData %>%
            colSelector(vecSelect=c("src", keyVars[c(intCtr, intCtr2)])) %>%
            group_by(across(all_of(c("src", keyVars[c(intCtr, intCtr2)])))) %>%
            summarize(n=n(), .groups="drop") %>%
            # .data[[...]] looks the column up in the plot data only (get() could also find a global object)
            ggplot(aes(x=.data[[keyVars[intCtr]]], y=.data[[keyVars[intCtr2]]])) + 
            geom_point(aes(color=src, size=n), alpha=0.25) + 
            labs(title="Distribution of Key Metrics by City", x=keyVars[intCtr], y=keyVars[intCtr2]) + 
            scale_size_continuous("# Obs")
        # Explicit print is needed for plots created inside a loop
        print(p1)
    }
}

The cities are well differentiated by several combinations, particularly surface pressure vs. MSL pressure

A full random forest model is run for predicting city using LA, NYC, and Chicago:

# Create set of relevant training variables
# Each column named pct_<metric> contributes <metric>; only the leading prefix is stripped
# (the pattern is anchored so a "pct_" occurring later in a name is never removed)
varsTrain <- allCity %>%
    select(starts_with("pct_")) %>%
    names() %>%
    str_remove(pattern="^pct_")
varsTrain
##  [1] "hour"                          "temperature_2m"               
##  [3] "relativehumidity_2m"           "dewpoint_2m"                  
##  [5] "apparent_temperature"          "pressure_msl"                 
##  [7] "surface_pressure"              "precipitation"                
##  [9] "rain"                          "snowfall"                     
## [11] "cloudcover"                    "cloudcover_low"               
## [13] "cloudcover_mid"                "cloudcover_high"              
## [15] "shortwave_radiation"           "direct_radiation"             
## [17] "direct_normal_irradiance"      "diffuse_radiation"            
## [19] "windspeed_10m"                 "windspeed_100m"               
## [21] "winddirection_10m"             "winddirection_100m"           
## [23] "windgusts_10m"                 "et0_fao_evapotranspiration"   
## [25] "weathercode"                   "vapor_pressure_deficit"       
## [27] "soil_temperature_0_to_7cm"     "soil_temperature_7_to_28cm"   
## [29] "soil_temperature_28_to_100cm"  "soil_temperature_100_to_255cm"
## [31] "soil_moisture_0_to_7cm"        "soil_moisture_7_to_28cm"      
## [33] "soil_moisture_28_to_100cm"     "soil_moisture_100_to_255cm"   
## [35] "year"                          "doy"
# Cities included in the classification model, and the label used in accuracy reporting and plot subtitle
keyCities <- c("NYC", "LA", "Chicago")
keyLabel <- "predictions based on pre-2022 training data applied to 2022 holdout dataset"

# Train on pre-2022 training rows, evaluate on 2022 test rows, both restricted to keyCities
rfCity <- runFullRF(
    dfTrain=filter(allCity, tt=="train", year<2022, src %in% keyCities), 
    dfTest=filter(allCity, tt=="test", year==2022, src %in% keyCities), 
    yVar="fct_src", 
    xVars=varsTrain, 
    useLabel=keyLabel, 
    useSub=stringr::str_to_sentence(keyLabel), 
    returnData=TRUE
)
## Warning: Dropped unused factor level(s) in dependent variable: Houston.

## 
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%

Prediction accuracy is 100%, as expected given the significant differentiation. Houston is assessed for the city it is “most similar” to:

# Apply the three-city model to 2022 test rows for every city (Houston included) and plot predicted vs. actual city
allCity %>% 
    filter(tt=="test", year==2022) %>%
    predictRF(rf=rfCity$rf, df=.) %>%
    plotConfusion(trueCol="fct_src", useSub=NULL, plotCont=FALSE)

Based on predictors in the three-city random forest, Houston is most similar to NYC. The full random forest model is updated, including Houston:

# Refit the city classifier with Houston added (same label, split and predictors as before)
keyCities <- c("NYC", "LA", "Chicago", "Houston")
rfCity <- runFullRF(
    dfTrain=filter(allCity, tt=="train", year<2022, src %in% keyCities), 
    dfTest=filter(allCity, tt=="test", year==2022, src %in% keyCities), 
    yVar="fct_src", 
    xVars=varsTrain, 
    useLabel=keyLabel, 
    useSub=stringr::str_to_sentence(keyLabel), 
    returnData=TRUE
)

## 
## Accuracy of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 100%

Even with the similarities between NYC and Houston, there is sufficient differentiation in the predictors to drive 100% accuracy

A model is created to predict temperature for two cities:

# Temperature regression trained on NYC and Chicago only
keyLabel <- "predictions based on pre-2022 training data applied to 2022 holdout dataset"
keyCities <- c("NYC", "Chicago")

# Predictors drop any variable whose name starts with "temp" or ends with "ature"
rfTemp2m <- runFullRF(
    dfTrain=filter(allCity, tt=="train", year<2022, src %in% keyCities), 
    dfTest=filter(allCity, tt=="test", year==2022, src %in% keyCities), 
    yVar="temperature_2m", 
    xVars=str_subset(varsTrain, pattern="^temp|ature$", negate=TRUE), 
    isContVar=TRUE,
    rndTo=-1L,
    refXY=TRUE,
    useLabel=keyLabel, 
    useSub=stringr::str_to_sentence(keyLabel), 
    returnData=TRUE
)
## Growing trees.. Progress: 65%. Estimated remaining time: 16 seconds.

## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.41% (RMSE 0.86 vs. 11.16 null)
## `geom_smooth()` using formula = 'y ~ x'

Temperature predictions on holdout data for NYC and Chicago have R-squared over 99%. The model is applied to data from Houston and LA:

# Apply the NYC/Chicago temperature model to the 2022 LA test rows and report R-squared
predTempLA <- allCity %>% 
    filter(tt=="test", year==2022, src=="LA") %>%
    predictRF(rf=rfTemp2m$rf, df=.)
reportAccuracy(predTempLA, 
               trueCol="temperature_2m", 
               useLabel="LA temperature predictions", 
               reportR2=TRUE
               )
## 
## R-squared of LA temperature predictions is: 92.38% (RMSE 1.89 vs. 6.86 null)
# Predicted vs. actual LA temperature (continuous target, rounded to nearest 0.5, with y=x reference line)
plotConfusion(predTempLA, 
              trueCol="temperature_2m", 
              useSub="LA", 
              plotCont=TRUE, 
              rndTo=0.5, 
              refXY=TRUE
              )
## `geom_smooth()` using formula = 'y ~ x'

# Apply the NYC/Chicago temperature model to the 2022 Houston test rows and report R-squared
predTempHOU <- allCity %>% 
    filter(tt=="test", year==2022, src=="Houston") %>%
    predictRF(rf=rfTemp2m$rf, df=.)
reportAccuracy(predTempHOU, 
               trueCol="temperature_2m", 
               useLabel="Houston temperature predictions", 
               reportR2=TRUE
               )
## 
## R-squared of Houston temperature predictions is: 97.22% (RMSE 1.44 vs. 8.63 null)
# Predicted vs. actual Houston temperature (continuous target, rounded to nearest 0.5, with y=x reference line)
plotConfusion(predTempHOU, 
              trueCol="temperature_2m", 
              useSub="Houston", 
              plotCont=TRUE, 
              rndTo=0.5, 
              refXY=TRUE
              )
## `geom_smooth()` using formula = 'y ~ x'

Predictions for two cities not included in the original model have R-squared of roughly 92% (LA) and 97% (Houston). Houston, being relatively similar to NYC, has a higher R-squared than LA

Function runFullRF() is updated to allow for using an existing model with new data:

runFullRF <- function(dfTrain, 
                      yVar, 
                      xVars, 
                      useExistingRF=NULL,
                      dfTest=dfTrain,
                      useLabel="test data",
                      useSub=NULL, 
                      isContVar=FALSE,
                      rndTo=NULL,
                      rndBucketsAuto=100,
                      nSig=NULL,
                      refXY=FALSE,
                      makePlots=TRUE,
                      plotImp=makePlots,
                      plotConf=makePlots,
                      returnData=FALSE, 
                      ...
                      ) {
    
    # Fit (or reuse) a random forest, predict on test data, report accuracy, and optionally plot and return results
    
    # FUNCTION ARGUMENTS:
    # dfTrain: training data (may be omitted when useExistingRF is passed, provided dfTest is passed)
    # yVar: dependent variable
    # xVars: column(s) containing independent variables (may be omitted when useExistingRF is passed)
    # useExistingRF: an existing RF model, meaning only steps 3-5 are run (default NULL means run all steps)
    # dfTest: test dataset for applying predictions
    # useLabel: label to be used for reporting accuracy
    # useSub: subtitle to be used for confusion chart (NULL means none)
    # isContVar: boolean, is the variable continuous? (default FALSE means categorical)
    # rndTo: every number in x should be rounded to the nearest rndTo
    #        NULL means no rounding (default)
    #        -1L means make an estimate based on data
    # rndBucketsAuto: integer, if rndTo is -1L, about how many buckets are desired for predictions?
    # nSig: number of significant digits for automatically calculated rounding parameter
    #       (NULL means calculate exactly)    
    # refXY: boolean, should a reference line for y=x be included? (relevant only for continuous)
    # makePlots: boolean, should plots be created for variable importance and confusion matrix?
    # plotImp: boolean, should variable importance be plotted? (default is makePlots)
    # plotConf: boolean, should confusion matrix be plotted? (default is makePlots)
    # returnData: boolean, should data be returned?
    # ...: additional parameters to pass to runSimpleRF(), which are then passed to ranger::ranger()
    #      (not used when useExistingRF is passed, since no model is fit)
    
    # RETURN VALUE:
    # if returnData is TRUE, a list with elements rf, rfImp, tstPred, rfAcc
    # (rfImp is NA when useExistingRF is passed); otherwise nothing is returned

    # Fail early with a clear message rather than a lazy-evaluation error deep inside a helper
    if(missing(yVar)) stop("yVar must be supplied", call.=FALSE)
    if(is.null(useExistingRF)) {
        if(missing(dfTrain) || missing(xVars)) {
            stop("dfTrain and xVars must be supplied unless useExistingRF is provided", call.=FALSE)
        }
    }
    else if(missing(dfTest) && missing(dfTrain)) {
        stop("dfTest (or dfTrain) must be supplied when useExistingRF is provided", call.=FALSE)
    }

    # Create the RF and plot importances, unless an RF is passed
    if(is.null(useExistingRF)) {
        # 1. Run random forest using impurity for importance
        rf <- runSimpleRF(df=dfTrain, yVar=yVar, xVars=xVars, importance="impurity", ...)

        # 2. Create, and optionally plot, variable importance
        rfImp <- plotRFImportance(rf, plotData=plotImp, returnData=TRUE)
    }
    else {
        # Reuse the supplied model; importance is not recomputed, so a placeholder is stored
        rf <- useExistingRF
        rfImp <- NA
    }

    # 3. Predict on test dataset
    tstPred <- predictRF(rf=rf, df=dfTest)

    # 4. Report on accuracy (updated for continuous or categorical)
    rfAcc <- reportAccuracy(tstPred, 
                            trueCol=yVar, 
                            rndReport=3, 
                            useLabel=useLabel, 
                            reportR2=isTRUE(isContVar),
                            returnAcc=TRUE
                            )

    # 5. Plot confusion data (updated for continuous vs. categorical) if requested
    if(isTRUE(plotConf)) {
        plotConfusion(tstPred, 
                      trueCol=yVar, 
                      useSub=useSub, 
                      plotCont=isTRUE(isContVar), 
                      rndTo=rndTo, 
                      rndBucketsAuto=rndBucketsAuto,
                      nSig=nSig,
                      refXY=refXY
                      )
    }
    
    #6. Return data if requested
    if(isTRUE(returnData)) return(list(rf=rf, rfImp=rfImp, tstPred=tstPred, rfAcc=rfAcc))
    
}

Updated function runFullRF() is tested on LA and Houston:

# Reuse the fitted NYC/Chicago temperature model on the 2022 LA test rows (no refit, so no dfTrain or xVars)
runFullRF(
    useExistingRF=rfTemp2m$rf, 
    yVar="temperature_2m", 
    dfTest=filter(allCity, tt=="test", year==2022, src=="LA"), 
    isContVar=TRUE,
    rndTo=0.5, 
    refXY=TRUE, 
    useLabel="LA temperature predictions", 
    useSub="LA"
)
## 
## R-squared of LA temperature predictions is: 92.382% (RMSE 1.89 vs. 6.86 null)
## `geom_smooth()` using formula = 'y ~ x'

# Reuse the fitted NYC/Chicago temperature model on the 2022 Houston test rows (no refit, so no dfTrain or xVars)
runFullRF(
    useExistingRF=rfTemp2m$rf, 
    yVar="temperature_2m", 
    dfTest=filter(allCity, tt=="test", year==2022, src=="Houston"), 
    isContVar=TRUE,
    rndTo=0.5, 
    refXY=TRUE, 
    useLabel="Houston temperature predictions", 
    useSub="Houston"
)
## 
## R-squared of Houston temperature predictions is: 97.223% (RMSE 1.44 vs. 8.63 null)
## `geom_smooth()` using formula = 'y ~ x'

A basic linear model can potentially drive better temperature predictions:

# Linear model for temperature from relative humidity, dewpoint and their interaction
# Fit on pre-2022 training rows for NYC and Chicago; columns are renamed to short names used in the formula
# (the lm() call is kept verbatim since summary() echoes it)
keyCities <- c("NYC", "Chicago")
lmMiniTemp <- allCity %>% 
    filter(src %in% keyCities, tt=="train", year<2022) %>%
    select(rh=relativehumidity_2m, d=dewpoint_2m, t=temperature_2m) %>%
    lm(t~rh+d+rh:d+1, data=.) 
summary(lmMiniTemp)
## 
## Call:
## lm(formula = t ~ rh + d + rh:d + 1, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8377 -0.4461 -0.1708  0.2944 12.0201 
## 
## Coefficients:
##               Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)  2.158e+01  7.965e-03  2709.77   <2e-16 ***
## rh          -2.300e-01  1.150e-04 -1999.27   <2e-16 ***
## d            1.087e+00  6.448e-04  1685.07   <2e-16 ***
## rh:d        -5.407e-04  9.068e-06   -59.63   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6296 on 147464 degrees of freedom
## Multiple R-squared:  0.9966, Adjusted R-squared:  0.9966 
## F-statistic: 1.428e+07 on 3 and 147464 DF,  p-value: < 2.2e-16
# Score the linear model on the 2022 test rows for the modeled cities, then average by temperature bucket
# rh and d are the short predictor names the model was fit with; they are dropped again before summarizing
# n is created before across(), so it is also passed through mean() and comes out as a double
ggMiniTemp <- allCity %>% 
    filter(tt=="test", year==2022, src %in% keyCities) %>% 
    select(temperature_2m, rh=relativehumidity_2m, d=dewpoint_2m) %>% 
    mutate(pred=predict(lmMiniTemp, newdata=.), 
           err=pred-temperature_2m, 
           err2=err^2, 
           rnd5=5*round(temperature_2m/5)
           ) %>% 
    select(-c(rh, d)) %>% 
    group_by(rnd5) %>% 
    summarize(n=n(), across(.cols=where(is.numeric), .fns=mean))
ggMiniTemp
## # A tibble: 13 × 6
##     rnd5     n temperature_2m    pred     err     err2
##    <dbl> <dbl>          <dbl>   <dbl>   <dbl>    <dbl>
##  1   -25     2        -23.3   -23.3   -0.0227  0.00534
##  2   -20    15        -19.4   -19.1    0.270   0.187  
##  3   -15    60        -14.5   -14.3    0.238   0.189  
##  4   -10   201         -9.76   -9.54   0.221   0.263  
##  5    -5   377         -4.52   -4.35   0.177   0.245  
##  6     0   648          0.202   0.184 -0.0177  0.267  
##  7     5   730          4.95    4.97   0.0224  0.248  
##  8    10   719         10.2    10.1   -0.0591  0.302  
##  9    15   692         14.9    14.9   -0.0380  0.433  
## 10    20   920         20.1    20.2    0.0703  0.244  
## 11    25   654         24.7    24.6   -0.0569  1.12   
## 12    30   254         29.4    28.4   -0.984   3.56   
## 13    35    38         34.2    31.2   -2.99   12.9
# Plot mean actual and mean predicted temperature against the actual temperature bucket (rounded to nearest 5)
# Dashed line is y=x for reference
# The x-axis label no longer says "New city": this plot covers holdout data for the cities the model was fit on
ggMiniTemp %>% 
    select(rnd5, temperature_2m, pred) %>%
    pivot_longer(cols=-c(rnd5)) %>%
    ggplot(aes(x=rnd5, y=value)) + 
    geom_line(aes(group=name, 
                  color=c("pred"="Predicted Mean", "temperature_2m"="Actual Mean")[name]
                  )
              ) + 
    labs(title="Actual vs. Predicted Temperature Using City Linear Model on Same City Holdout Data", 
         x="Actual temperature (rounded to nearest 5)", 
         y="Average temperature for metric"
         ) + 
    scale_color_discrete("Metric") + 
    geom_abline(slope=1, intercept=0, lty=2)

Predictions can then be explored in cities not included in the original linear model, starting with Houston:

# Score the NYC/Chicago linear model on the 2022 Houston test rows, then average by temperature bucket
# rh and d are the short predictor names the model was fit with; they are dropped again before summarizing
# n is created before across(), so it is also passed through mean() and comes out as a double
ggMiniTemp_hou <- allCity %>% 
    filter(tt=="test", year==2022, src %in% c("Houston")) %>% 
    select(temperature_2m, rh=relativehumidity_2m, d=dewpoint_2m) %>% 
    mutate(pred=predict(lmMiniTemp, newdata=.), 
           err=pred-temperature_2m, 
           err2=err^2, 
           rnd5=5*round(temperature_2m/5)
           ) %>% 
    select(-c(rh, d)) %>% 
    group_by(rnd5) %>% 
    summarize(n=n(), across(.cols=where(is.numeric), .fns=mean))
ggMiniTemp_hou
## # A tibble: 11 × 6
##     rnd5     n temperature_2m   pred     err   err2
##    <dbl> <dbl>          <dbl>  <dbl>   <dbl>  <dbl>
##  1   -10     2         -7.95  -8.68  -0.730   0.600
##  2    -5    14         -4.25  -4.40  -0.150   0.428
##  3     0    38          0.429  0.608  0.179   0.222
##  4     5   197          5.21   5.29   0.0779  0.245
##  5    10   304          9.94   9.92  -0.0131  0.313
##  6    15   291         15.1   14.8   -0.292   0.700
##  7    20   507         20.3   20.0   -0.294   0.935
##  8    25   744         25.1   25.0   -0.0138  0.657
##  9    30   429         29.6   29.6    0.0214  1.12 
## 10    35   145         34.3   33.0   -1.30    2.92 
## 11    40     4         38.4   35.1   -3.34   11.3
# Overall MSE is the count-weighted average of the per-bucket mean squared errors; RMSE is its square root
ggMiniTemp_hou %>% 
    summarize(mse=sum(n*err2)/sum(n), 
              rmse=sqrt(mse)
              )
## # A tibble: 1 × 2
##     mse  rmse
##   <dbl> <dbl>
## 1 0.850 0.922
# Mean actual and mean predicted Houston temperature by actual-temperature bucket, with dashed y=x reference
ggMiniTemp_hou %>% 
    select(rnd5, temperature_2m, pred) %>%
    pivot_longer(cols=-c(rnd5)) %>%
    mutate(metric=unname(c("pred"="Predicted Mean", "temperature_2m"="Actual Mean")[name])) %>%
    ggplot(aes(x=rnd5, y=value)) + 
    geom_line(aes(group=name, color=metric)) + 
    geom_abline(slope=1, intercept=0, lty=2) + 
    scale_color_discrete("Metric") + 
    labs(title="Actual vs. Predicted Temperature Using City Linear Model on New City (Houston) Holdout Data", 
         x="New city (Houston) actual temperature (rounded to nearest 5)", 
         y="Average temperature for metric"
         )

The linear model is generally very accurate for Houston, with the exception of under-predicting the very highest temperatures. RMSE of temperature predictions is lowered to ~1 from ~1.5 observed using the random forest

Predictions are also explored in Los Angeles:

# Score the NYC/Chicago linear model on the 2022 LA test rows, then average by temperature bucket
# rh and d are the short predictor names the model was fit with; they are dropped again before summarizing
# n is created before across(), so it is also passed through mean() and comes out as a double
ggMiniTemp_lax <- allCity %>% 
    filter(tt=="test", year==2022, src %in% c("LA")) %>% 
    select(temperature_2m, rh=relativehumidity_2m, d=dewpoint_2m) %>% 
    mutate(pred=predict(lmMiniTemp, newdata=.), 
           err=pred-temperature_2m, 
           err2=err^2, 
           rnd5=5*round(temperature_2m/5)
           ) %>% 
    select(-c(rh, d)) %>% 
    group_by(rnd5) %>% 
    summarize(n=n(), across(.cols=where(is.numeric), .fns=mean))
ggMiniTemp_lax
## # A tibble: 10 × 6
##     rnd5     n temperature_2m   pred     err    err2
##    <dbl> <dbl>          <dbl>  <dbl>   <dbl>   <dbl>
##  1     0     6           1.1   0.935  -0.165   0.205
##  2     5   127           5.72  5.52   -0.201   1.10 
##  3    10   605          10.2   9.20   -1.01    5.02 
##  4    15   754          15.1  13.9    -1.21    7.89 
##  5    20   585          19.7  17.5    -2.15   20.7  
##  6    25   331          24.7  22.1    -2.62   28.3  
##  7    30   176          29.7  24.2    -5.52   55.0  
##  8    35    49          34.4  25.9    -8.47   94.9  
##  9    40     7          38.9  28.6   -10.2   124.   
## 10    45     1          42.7  23.8   -18.9   356.
# Overall MSE is the count-weighted average of the per-bucket mean squared errors; RMSE is its square root
ggMiniTemp_lax %>% 
    summarize(mse=sum(n*err2)/sum(n), 
              rmse=sqrt(mse)
              )
## # A tibble: 1 × 2
##     mse  rmse
##   <dbl> <dbl>
## 1  17.5  4.18
# Mean actual and mean predicted LA temperature by actual-temperature bucket, with dashed y=x reference
ggMiniTemp_lax %>% 
    select(rnd5, temperature_2m, pred) %>%
    pivot_longer(cols=-c(rnd5)) %>%
    mutate(metric=unname(c("pred"="Predicted Mean", "temperature_2m"="Actual Mean")[name])) %>%
    ggplot(aes(x=rnd5, y=value)) + 
    geom_line(aes(group=name, color=metric)) + 
    geom_abline(slope=1, intercept=0, lty=2) + 
    scale_color_discrete("Metric") + 
    labs(title="Actual vs. Predicted Temperature Using City Linear Model on New City (LA) Holdout Data", 
         x="New city (LA) actual temperature (rounded to nearest 5)", 
         y="Average temperature for metric"
         )

The linear model is generally inaccurate for LA, consistently underestimating temperatures. RMSE of temperature predictions is raised to ~4 from ~2 observed using the random forest

Los Angeles is meaningfully different from NYC and Chicago on key predictors:

# Count observations per city at each rounded (relative humidity, dewpoint, temperature) combination
tmpPlotData <- allCity %>% 
    count(src, 
          relativehumidity_2m=round(relativehumidity_2m), 
          dewpoint_2m=round(dewpoint_2m), 
          temperature_2m=round(temperature_2m)
          )

# Dewpoint vs. temperature by city: point size and the linear fit are both weighted by observation count
tmpPlotData %>%
    group_by(src, temperature_2m, dewpoint_2m) %>%
    summarize(n=sum(n), .groups="drop") %>%
    ggplot(aes(x=temperature_2m, y=dewpoint_2m, color=src)) + 
    geom_point(aes(size=n), alpha=0.2) + 
    geom_smooth(aes(weight=n), method="lm") +
    labs(title="T/D by city")
## `geom_smooth()` using formula = 'y ~ x'

# Relative humidity vs. temperature by city: point size and the linear fit are both weighted by observation count
tmpPlotData %>%
    group_by(src, temperature_2m, relativehumidity_2m) %>%
    summarize(n=sum(n), .groups="drop") %>%
    ggplot(aes(x=temperature_2m, y=relativehumidity_2m, color=src)) + 
    geom_point(aes(size=n), alpha=0.1) + 
    geom_smooth(aes(weight=n), method="lm") +
    labs(title="T/RH by city")
## `geom_smooth()` using formula = 'y ~ x'

Los Angeles is routinely hot and arid, while the other cities tend to be humid when they are hot. Data for an additional low-humidity city are downloaded and cached to avoid multiple hits to the server:

# Hourly data download for Las Vegas, NV
# Build the request URL for every hourly metric listed in tblMetricsHourly, 2010 through 2023
# seq_len() is used instead of 1:nrow() so that an empty metrics table yields no indices rather than c(1, 0)
testURLHourly <- helperOpenMeteoURL(cityName="Las Vegas NV", 
                                    hourlyIndices=seq_len(nrow(tblMetricsHourly)),
                                    startDate="2010-01-01", 
                                    endDate="2023-12-31", 
                                    tz="America/Los_Angeles"
                                    )
## 
## Hourly metrics created from indices: temperature_2m,relativehumidity_2m,dewpoint_2m,apparent_temperature,pressure_msl,surface_pressure,precipitation,rain,snowfall,cloudcover,cloudcover_low,cloudcover_mid,cloudcover_high,shortwave_radiation,direct_radiation,direct_normal_irradiance,diffuse_radiation,windspeed_10m,windspeed_100m,winddirection_10m,winddirection_100m,windgusts_10m,et0_fao_evapotranspiration,weathercode,vapor_pressure_deficit,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm
# Show the assembled hourly request URL
testURLHourly
## [1] "https://archive-api.open-meteo.com/v1/archive?latitude=36.21&longitude=-115.22&start_date=2010-01-01&end_date=2023-12-31&hourly=temperature_2m,relativehumidity_2m,dewpoint_2m,apparent_temperature,pressure_msl,surface_pressure,precipitation,rain,snowfall,cloudcover,cloudcover_low,cloudcover_mid,cloudcover_high,shortwave_radiation,direct_radiation,direct_normal_irradiance,diffuse_radiation,windspeed_10m,windspeed_100m,winddirection_10m,winddirection_100m,windgusts_10m,et0_fao_evapotranspiration,weathercode,vapor_pressure_deficit,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm&timezone=America%2FLos_Angeles"
# Fetch the hourly JSON only when no cached copy is on disk (avoids repeat hits to the server)
if(file.exists("testOM_hourly_las.json")) {
    cat("\nFile testOM_hourly_las.json already exists, skipping download\n")
} else {
    fileDownload(fileName="testOM_hourly_las.json", url=testURLHourly)
}
## 
## File testOM_hourly_las.json already exists, skipping download
# Daily data download for Las Vegas, NV
# Builds the Open-Meteo archive request URL for 2010-01-01 through 2023-12-31
# dailyIndices spans every row of tblMetricsDaily, i.e. all listed daily metrics are requested
# Dates are requested in the America/Los_Angeles time zone
testURLDaily <- helperOpenMeteoURL(cityName="Las Vegas NV", 
                                   dailyIndices=1:nrow(tblMetricsDaily),
                                   startDate="2010-01-01", 
                                   endDate="2023-12-31", 
                                   tz="America/Los_Angeles"
                                   )
## 
## Daily metrics created from indices: weathercode,temperature_2m_max,temperature_2m_min,apparent_temperature_max,apparent_temperature_min,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,sunrise,sunset,windspeed_10m_max,windgusts_10m_max,winddirection_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration
# Show the assembled daily request URL
testURLDaily
## [1] "https://archive-api.open-meteo.com/v1/archive?latitude=36.21&longitude=-115.22&start_date=2010-01-01&end_date=2023-12-31&daily=weathercode,temperature_2m_max,temperature_2m_min,apparent_temperature_max,apparent_temperature_min,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,sunrise,sunset,windspeed_10m_max,windgusts_10m_max,winddirection_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration&timezone=America%2FLos_Angeles"
# Fetch the daily JSON only when no cached copy is on disk (avoids repeat hits to the server)
if(file.exists("testOM_daily_las.json")) {
    cat("\nFile testOM_daily_las.json already exists, skipping download\n")
} else {
    fileDownload(fileName="testOM_daily_las.json", url=testURLDaily)
}
## 
## File testOM_daily_las.json already exists, skipping download

The daily and hourly datasets are loaded:

# Read daily JSON file
# formatOpenMeteoJSON echoes the parsed components (tblDaily, tblHourly, tblUnits, tblDescription) 
# as it runs; the result is kept as lasOMDaily
lasOMDaily <- formatOpenMeteoJSON("testOM_daily_las.json")
## 
## Objects in JSON include: latitude, longitude, generationtime_ms, utc_offset_seconds, timezone, timezone_abbreviation, elevation, daily_units, daily 
## 
## $tblDaily
## # A tibble: 5,113 × 18
##    date       time       weathercode temperature_2m_max temperature_2m_min
##    <date>     <chr>            <int>              <dbl>              <dbl>
##  1 2010-01-01 2010-01-01           2               10.3               -1.3
##  2 2010-01-02 2010-01-02           0               14.2               -0.4
##  3 2010-01-03 2010-01-03           0               14.2                0.7
##  4 2010-01-04 2010-01-04           1               13.3                2.8
##  5 2010-01-05 2010-01-05           1               13.6                0.7
##  6 2010-01-06 2010-01-06           1               15.8                2.5
##  7 2010-01-07 2010-01-07           2               16.1                6  
##  8 2010-01-08 2010-01-08           1               11.2                1.2
##  9 2010-01-09 2010-01-09           1               13.2                0.5
## 10 2010-01-10 2010-01-10           2               15.6                5.9
## # ℹ 5,103 more rows
## # ℹ 13 more variables: apparent_temperature_max <dbl>,
## #   apparent_temperature_min <dbl>, precipitation_sum <dbl>, rain_sum <dbl>,
## #   snowfall_sum <dbl>, precipitation_hours <dbl>, sunrise <chr>, sunset <chr>,
## #   windspeed_10m_max <dbl>, windgusts_10m_max <dbl>,
## #   winddirection_10m_dominant <int>, shortwave_radiation_sum <dbl>,
## #   et0_fao_evapotranspiration <dbl>
## 
## $tblHourly
## NULL
## 
## $tblUnits
## # A tibble: 17 × 4
##    metricType  name                       value      description                
##    <chr>       <chr>                      <chr>      <chr>                      
##  1 daily_units time                       "iso8601"  <NA>                       
##  2 daily_units weathercode                "wmo code" The most severe weather co…
##  3 daily_units temperature_2m_max         "deg C"    Maximum and minimum daily …
##  4 daily_units temperature_2m_min         "deg C"    Maximum and minimum daily …
##  5 daily_units apparent_temperature_max   "deg C"    Maximum and minimum daily …
##  6 daily_units apparent_temperature_min   "deg C"    Maximum and minimum daily …
##  7 daily_units precipitation_sum          "mm"       Sum of daily precipitation…
##  8 daily_units rain_sum                   "mm"       Sum of daily rain          
##  9 daily_units snowfall_sum               "cm"       Sum of daily snowfall      
## 10 daily_units precipitation_hours        "h"        The number of hours with r…
## 11 daily_units sunrise                    "iso8601"  Sun rise and set times     
## 12 daily_units sunset                     "iso8601"  Sun rise and set times     
## 13 daily_units windspeed_10m_max          "km/h"     Maximum wind speed and gus…
## 14 daily_units windgusts_10m_max          "km/h"     Maximum wind speed and gus…
## 15 daily_units winddirection_10m_dominant "deg "     Dominant wind direction    
## 16 daily_units shortwave_radiation_sum    "MJ/m²"    The sum of solar radiaion …
## 17 daily_units et0_fao_evapotranspiration "mm"       Daily sum of ET0 Reference…
## 
## $tblDescription
## # A tibble: 1 × 7
##   latitude longitude generationtime_ms utc_offset_seconds timezone           
##      <dbl>     <dbl>             <dbl>              <int> <chr>              
## 1     36.2     -115.              69.8             -25200 America/Los_Angeles
## # ℹ 2 more variables: timezone_abbreviation <chr>, elevation <dbl>
## 
## 
## latitude: 36.23901
## longitude: -115.1625
## generationtime_ms: 69.77499
## utc_offset_seconds: -25200
## timezone: America/Los_Angeles
## timezone_abbreviation: PDT
## elevation: 686
# Read hourly JSON file
# With addVars=TRUE the run recorded below reports 80 columns: the raw hourly metrics plus 
# derived fields such as year, month, tod, season and the pct_* columns
# NOTE(review): lasTemp is later row-bound with the other cities, so it is presumably the 
# augmented hourly tibble rather than the full list - confirm against formatOpenMeteoJSON
lasTemp <- formatOpenMeteoJSON("testOM_hourly_las.json", addVars=TRUE)
## 
## Objects in JSON include: latitude, longitude, generationtime_ms, utc_offset_seconds, timezone, timezone_abbreviation, elevation, hourly_units, hourly 
## 
## $tblDaily
## NULL
## 
## $tblHourly
## # A tibble: 122,712 × 37
##    time                date        hour temperature_2m relativehumidity_2m
##    <dttm>              <date>     <int>          <dbl>               <int>
##  1 2010-01-01 00:00:00 2010-01-01     0            1.3                  53
##  2 2010-01-01 01:00:00 2010-01-01     1            0.5                  56
##  3 2010-01-01 02:00:00 2010-01-01     2            0.1                  56
##  4 2010-01-01 03:00:00 2010-01-01     3           -0.3                  57
##  5 2010-01-01 04:00:00 2010-01-01     4           -0.8                  59
##  6 2010-01-01 05:00:00 2010-01-01     5           -1.1                  60
##  7 2010-01-01 06:00:00 2010-01-01     6           -1.3                  60
##  8 2010-01-01 07:00:00 2010-01-01     7           -1.2                  58
##  9 2010-01-01 08:00:00 2010-01-01     8           -1.2                  56
## 10 2010-01-01 09:00:00 2010-01-01     9           -0.1                  56
## # ℹ 122,702 more rows
## # ℹ 32 more variables: dewpoint_2m <dbl>, apparent_temperature <dbl>,
## #   pressure_msl <dbl>, surface_pressure <dbl>, precipitation <dbl>,
## #   rain <dbl>, snowfall <dbl>, cloudcover <int>, cloudcover_low <int>,
## #   cloudcover_mid <int>, cloudcover_high <int>, shortwave_radiation <dbl>,
## #   direct_radiation <dbl>, direct_normal_irradiance <dbl>,
## #   diffuse_radiation <dbl>, windspeed_10m <dbl>, windspeed_100m <dbl>, …
## 
## $tblUnits
## # A tibble: 34 × 4
##    metricType   name                 value   description                        
##    <chr>        <chr>                <chr>   <chr>                              
##  1 hourly_units time                 iso8601 <NA>                               
##  2 hourly_units temperature_2m       deg C   Air temperature at 2 meters above …
##  3 hourly_units relativehumidity_2m  %       Relative humidity at 2 meters abov…
##  4 hourly_units dewpoint_2m          deg C   Dew point temperature at 2 meters …
##  5 hourly_units apparent_temperature deg C   Apparent temperature is the percei…
##  6 hourly_units pressure_msl         hPa     Atmospheric air pressure reduced t…
##  7 hourly_units surface_pressure     hPa     Atmospheric air pressure reduced t…
##  8 hourly_units precipitation        mm      Total precipitation (rain, showers…
##  9 hourly_units rain                 mm      Only liquid precipitation of the p…
## 10 hourly_units snowfall             cm      Snowfall amount of the preceding h…
## # ℹ 24 more rows
## 
## $tblDescription
## # A tibble: 1 × 7
##   latitude longitude generationtime_ms utc_offset_seconds timezone           
##      <dbl>     <dbl>             <dbl>              <int> <chr>              
## 1     36.2     -115.             7256.             -25200 America/Los_Angeles
## # ℹ 2 more variables: timezone_abbreviation <chr>, elevation <dbl>
## 
## 
## latitude: 36.23901
## longitude: -115.1625
## generationtime_ms: 7256.367
## utc_offset_seconds: -25200
## timezone: America/Los_Angeles
## timezone_abbreviation: PDT
## elevation: 686
## 
## Rows: 122,712
## Columns: 80
## $ time                              <dttm> 2010-01-01 00:00:00, 2010-01-01 01:…
## $ date                              <date> 2010-01-01, 2010-01-01, 2010-01-01,…
## $ hour                              <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11…
## $ temperature_2m                    <dbl> 1.3, 0.5, 0.1, -0.3, -0.8, -1.1, -1.…
## $ relativehumidity_2m               <int> 53, 56, 56, 57, 59, 60, 60, 58, 56, …
## $ dewpoint_2m                       <dbl> -7.2, -7.3, -7.6, -7.7, -7.8, -7.9, …
## $ apparent_temperature              <dbl> -2.5, -3.3, -3.6, -4.1, -4.3, -4.7, …
## $ pressure_msl                      <dbl> 1031.2, 1031.1, 1030.8, 1031.7, 1031…
## $ surface_pressure                  <dbl> 947.4, 947.1, 946.7, 947.4, 946.9, 9…
## $ precipitation                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ rain                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ snowfall                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cloudcover                        <int> 12, 12, 12, 12, 12, 9, 11, 6, 3, 19,…
## $ cloudcover_low                    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cloudcover_mid                    <int> 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 28,…
## $ cloudcover_high                   <int> 40, 40, 40, 39, 40, 29, 32, 19, 10, …
## $ shortwave_radiation               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 76, 240, …
## $ direct_radiation                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 50, 185, …
## $ direct_normal_irradiance          <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0…
## $ diffuse_radiation                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 26, 55, 6…
## $ windspeed_10m                     <dbl> 5.0, 5.5, 4.7, 4.9, 3.1, 3.5, 3.4, 3…
## $ windspeed_100m                    <dbl> 5.7, 7.2, 6.9, 6.5, 6.3, 6.0, 6.9, 6…
## $ winddirection_10m                 <int> 291, 293, 293, 287, 291, 294, 302, 2…
## $ winddirection_100m                <int> 342, 342, 351, 354, 24, 17, 6, 6, 35…
## $ windgusts_10m                     <dbl> 9.7, 10.1, 10.1, 9.7, 9.0, 9.0, 9.0,…
## $ et0_fao_evapotranspiration        <dbl> 0.01, 0.01, 0.01, 0.01, 0.00, 0.00, …
## $ weathercode                       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ vapor_pressure_deficit            <dbl> 0.31, 0.28, 0.27, 0.26, 0.24, 0.23, …
## $ soil_temperature_0_to_7cm         <dbl> 0.0, -0.3, -0.6, -0.8, -1.0, -1.1, -…
## $ soil_temperature_7_to_28cm        <dbl> 5.2, 5.1, 5.0, 4.9, 4.7, 4.6, 4.5, 4…
## $ soil_temperature_28_to_100cm      <dbl> 10.2, 10.2, 10.2, 10.2, 10.2, 10.2, …
## $ soil_temperature_100_to_255cm     <dbl> 21.3, 21.3, 21.3, 21.3, 21.3, 21.3, …
## $ soil_moisture_0_to_7cm            <dbl> 0.069, 0.069, 0.069, 0.069, 0.069, 0…
## $ soil_moisture_7_to_28cm           <dbl> 0.126, 0.126, 0.126, 0.126, 0.126, 0…
## $ soil_moisture_28_to_100cm         <dbl> 0.142, 0.142, 0.142, 0.142, 0.142, 0…
## $ soil_moisture_100_to_255cm        <dbl> 0.12, 0.12, 0.12, 0.12, 0.12, 0.12, …
## $ origTime                          <chr> "2010-01-01T00:00", "2010-01-01T01:0…
## $ year                              <dbl> 2010, 2010, 2010, 2010, 2010, 2010, …
## $ month                             <fct> Jan, Jan, Jan, Jan, Jan, Jan, Jan, J…
## $ fct_hour                          <fct> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11…
## $ tod                               <fct> Night, Night, Night, Night, Night, N…
## $ doy                               <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ season                            <fct> Winter, Winter, Winter, Winter, Wint…
## $ todSeason                         <fct> Winter-Night, Winter-Night, Winter-N…
## $ pct_hour                          <dbl> 0, 4, 8, 13, 17, 21, 25, 29, 33, 38,…
## $ pct_temperature_2m                <dbl> 2, 1, 1, 1, 1, 1, 0, 0, 0, 1, 4, 10,…
## $ pct_relativehumidity_2m           <dbl> 87, 88, 88, 89, 90, 91, 91, 90, 88, …
## $ pct_dewpoint_2m                   <dbl> 23, 22, 21, 21, 20, 20, 19, 18, 17, …
## $ pct_apparent_temperature          <dbl> 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 10,…
## $ pct_pressure_msl                  <dbl> 99, 99, 99, 99, 99, 99, 99, 99, 99, …
## $ pct_surface_pressure              <dbl> 98, 98, 98, 98, 98, 98, 98, 98, 98, …
## $ pct_precipitation                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_rain                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_snowfall                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_cloudcover                    <dbl> 65, 65, 65, 65, 65, 62, 64, 59, 54, …
## $ pct_cloudcover_low                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_cloudcover_mid                <dbl> 0, 0, 0, 0, 0, 0, 73, 0, 0, 0, 70, 8…
## $ pct_cloudcover_high               <dbl> 75, 75, 75, 74, 75, 71, 72, 68, 64, …
## $ pct_shortwave_radiation           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 64, 7…
## $ pct_direct_radiation              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 57, 66, 7…
## $ pct_direct_normal_irradiance      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 61, 74, 8…
## $ pct_diffuse_radiation             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 54, 59, 6…
## $ pct_windspeed_10m                 <dbl> 27, 33, 24, 27, 10, 14, 13, 11, 19, …
## $ pct_windspeed_100m                <dbl> 24, 33, 31, 29, 28, 26, 31, 31, 27, …
## $ pct_winddirection_10m             <dbl> 75, 76, 76, 74, 75, 76, 79, 77, 74, …
## $ pct_winddirection_100m            <dbl> 94, 94, 96, 97, 7, 5, 1, 1, 96, 93, …
## $ pct_windgusts_10m                 <dbl> 15, 17, 17, 15, 12, 12, 12, 12, 9, 1…
## $ pct_et0_fao_evapotranspiration    <dbl> 5, 5, 5, 5, 0, 0, 0, 0, 0, 10, 37, 5…
## $ pct_weathercode                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 72,…
## $ pct_vapor_pressure_deficit        <dbl> 5, 4, 4, 4, 3, 3, 3, 3, 3, 4, 7, 19,…
## $ pct_soil_temperature_0_to_7cm     <dbl> 2, 2, 1, 1, 1, 1, 1, 1, 0, 1, 2, 4, …
## $ pct_soil_temperature_7_to_28cm    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, …
## $ pct_soil_temperature_28_to_100cm  <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ pct_soil_temperature_100_to_255cm <dbl> 44, 44, 44, 44, 44, 44, 44, 44, 44, …
## $ pct_soil_moisture_0_to_7cm        <dbl> 88, 88, 88, 88, 88, 88, 88, 88, 88, …
## $ pct_soil_moisture_7_to_28cm       <dbl> 75, 75, 75, 75, 75, 75, 75, 75, 75, …
## $ pct_soil_moisture_28_to_100cm     <dbl> 64, 64, 64, 64, 64, 64, 64, 64, 64, …
## $ pct_soil_moisture_100_to_255cm    <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, …
## $ pct_year                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pct_doy                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

## # A tibble: 8 × 4
##   todSeason    season tod       n
##   <fct>        <fct>  <fct> <int>
## 1 Spring-Day   Spring Day   15456
## 2 Spring-Night Spring Night 15456
## 3 Summer-Day   Summer Day   15456
## 4 Summer-Night Summer Night 15456
## 5 Fall-Day     Fall   Day   15288
## 6 Fall-Night   Fall   Night 15288
## 7 Winter-Day   Winter Day   15156
## 8 Winter-Night Winter Night 15156
## # A tibble: 24 × 4
##     hour fct_hour tod       n
##    <int> <fct>    <fct> <int>
##  1     0 0        Night  5113
##  2     1 1        Night  5113
##  3     2 2        Night  5113
##  4     3 3        Night  5113
##  5     4 4        Night  5113
##  6     5 5        Night  5113
##  7     6 6        Night  5113
##  8     7 7        Day    5113
##  9     8 8        Day    5113
## 10     9 9        Day    5113
## 11    10 10       Day    5113
## 12    11 11       Day    5113
## 13    12 12       Day    5113
## 14    13 13       Day    5113
## 15    14 14       Day    5113
## 16    15 15       Day    5113
## 17    16 16       Day    5113
## 18    17 17       Day    5113
## 19    18 18       Day    5113
## 20    19 19       Night  5113
## 21    20 20       Night  5113
## 22    21 21       Night  5113
## 23    22 22       Night  5113
## 24    23 23       Night  5113
## # A tibble: 12 × 3
##    month season     n
##    <fct> <fct>  <int>
##  1 Jan   Winter 10416
##  2 Feb   Winter  9480
##  3 Mar   Spring 10416
##  4 Apr   Spring 10080
##  5 May   Spring 10416
##  6 Jun   Summer 10080
##  7 Jul   Summer 10416
##  8 Aug   Summer 10416
##  9 Sep   Fall   10080
## 10 Oct   Fall   10416
## 11 Nov   Fall   10080
## 12 Dec   Winter 10416

An integrated set of all-city test and train data is updated:

# Stack the per-city hourly tibbles, recording each row's city in src
allCity <- bind_rows(list("NYC"=nycTemp, 
                          "LA"=laxTemp, 
                          "Chicago"=chiTemp, 
                          "Houston"=houTemp, 
                          "Vegas"=lasTemp
                          ), 
                     .id="src"
                     )

# Draw a reproducible sample of 70% of the row positions to serve as training data
set.seed(24070113)
idxTrain_v2 <- sample.int(nrow(allCity), size=round(0.7*nrow(allCity)), replace=FALSE)

# Flag every row as train or test by position, and keep the city as a factor as well
allCity <- allCity %>%
    mutate(tt=ifelse(seq_len(n()) %in% idxTrain_v2, "train", "test"), 
           fct_src=factor(src))
allCity
## # A tibble: 608,784 × 83
##    src   time                date        hour temperature_2m relativehumidity_2m
##    <chr> <dttm>              <date>     <int>          <dbl>               <int>
##  1 NYC   2010-01-01 00:00:00 2010-01-01     0           -1.1                  95
##  2 NYC   2010-01-01 01:00:00 2010-01-01     1           -1                    96
##  3 NYC   2010-01-01 02:00:00 2010-01-01     2           -1                    96
##  4 NYC   2010-01-01 03:00:00 2010-01-01     3           -0.8                  97
##  5 NYC   2010-01-01 04:00:00 2010-01-01     4           -0.9                  97
##  6 NYC   2010-01-01 05:00:00 2010-01-01     5           -0.8                  97
##  7 NYC   2010-01-01 06:00:00 2010-01-01     6           -0.7                  97
##  8 NYC   2010-01-01 07:00:00 2010-01-01     7           -0.5                  97
##  9 NYC   2010-01-01 08:00:00 2010-01-01     8           -0.6                  97
## 10 NYC   2010-01-01 09:00:00 2010-01-01     9           -0.6                  97
## # ℹ 608,774 more rows
## # ℹ 77 more variables: dewpoint_2m <dbl>, apparent_temperature <dbl>,
## #   pressure_msl <dbl>, surface_pressure <dbl>, precipitation <dbl>,
## #   rain <dbl>, snowfall <dbl>, cloudcover <int>, cloudcover_low <int>,
## #   cloudcover_mid <int>, cloudcover_high <int>, shortwave_radiation <dbl>,
## #   direct_radiation <dbl>, direct_normal_irradiance <dbl>,
## #   diffuse_radiation <dbl>, windspeed_10m <dbl>, windspeed_100m <dbl>, …
# Review counts by year
# One row per city and train/test flag, one column of row counts per year
allCity %>% 
    count(year, src, tt) %>% 
    pivot_wider(id_cols=c(src, tt), names_from=year, values_from=n)
## # A tibble: 10 × 16
##    src     tt    `2010` `2011` `2012` `2013` `2014` `2015` `2016` `2017` `2018`
##    <chr>   <chr>  <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>
##  1 Chicago test    2569   2593   2572   2660   2623   2591   2583   2679   2692
##  2 Chicago train   6191   6167   6212   6100   6137   6169   6201   6081   6068
##  3 Houston test    2687   2539   2612   2665   2675   2607   2652   2686   2662
##  4 Houston train   6073   6221   6172   6095   6085   6153   6132   6074   6098
##  5 LA      test    2565   2607   2588   2674   2627   2641   2685   2650   2655
##  6 LA      train   6195   6153   6196   6086   6133   6119   6099   6110   6105
##  7 NYC     test    2633   2602   2622   2623   2672   2583   2603   2607   2670
##  8 NYC     train   6127   6158   6162   6137   6088   6177   6181   6153   6090
##  9 Vegas   test    2582   2528   2642   2619   2633   2587   2650   2679   2618
## 10 Vegas   train   6178   6232   6142   6141   6127   6173   6134   6081   6142
## # ℹ 5 more variables: `2019` <int>, `2020` <int>, `2021` <int>, `2022` <int>,
## #   `2023` <int>

Distributions of several key variables are explored:

# Metrics whose per-city distributions are compared
keyVars <- c("temperature_2m", "relativehumidity_2m", "dewpoint_2m", 
             "shortwave_radiation", "vapor_pressure_deficit", 
             "soil_temperature_28_to_100cm", "soil_temperature_100_to_255cm", 
             "soil_moisture_28_to_100cm", "soil_moisture_100_to_255cm"
             )

# Boxplot of each key metric by city; every facet gets its own y-axis scale
allCity %>%
    colSelector(vecSelect=c("src", keyVars)) %>%
    pivot_longer(cols=!src) %>%
    ggplot(aes(x=src, y=value, fill=src)) + 
    geom_boxplot() + 
    facet_wrap(~name, scales="free_y") + 
    scale_fill_discrete(NULL) + 
    labs(x=NULL, y=NULL, title="Distribution of Key Metrics by City")

Las Vegas stands out for especially low relative humidity (even relative to LA), as well as dry soil (similar to LA)

The scatter of temperature and dewpoint is also explored:

# Temperature vs. dewpoint at whole-degree resolution
# Cell counts (n) drive both the point size and the weights of the per-city linear fit
allCity %>% 
    transmute(t=round(temperature_2m), d=round(dewpoint_2m), src) %>% 
    count(src, t, d) %>% 
    ggplot(aes(x=t, y=d, color=src)) + 
    geom_point(aes(size=n), alpha=0.5) + 
    geom_smooth(aes(weight=n), method="lm") +
    labs(x="Temperature (C)", y="Dewpoint (C)", title="Temperature vs. Dewpoint", subtitle="Hourly") + 
    scale_color_discrete(NULL) + 
    scale_size_continuous("# Obs")
## `geom_smooth()` using formula = 'y ~ x'

# Correlation between hourly temperature and dewpoint, computed separately for each city
# (cor() is called with its default method)
allCity %>% 
    group_by(src) %>%
    summarize(cor_td=cor(temperature_2m, dewpoint_2m))
## # A tibble: 5 × 2
##   src     cor_td
##   <chr>    <dbl>
## 1 Chicago  0.950
## 2 Houston  0.834
## 3 LA       0.273
## 4 NYC      0.919
## 5 Vegas    0.371

Las Vegas is similar to LA, with lower dewpoints. The more humid cities have 80%+ correlation between temperature and dewpoint, dropping to roughly 30-40% correlation in the drier cities (LA ~27%, Las Vegas ~37%).

Models for predicting city (one with soil moisture, one without) are saved using data without Las Vegas, for application to the new Las Vegas data:

# Run with all variables
# Train a city classifier on pre-2022 training rows and score it on 2022 test rows, both 
# excluding Las Vegas
# Vegas rows are filtered out BEFORE fct_src is rebuilt, so the response factor holds only the 
# cities actually present and ranger has no unused "Vegas" level to warn about and drop
rfCityFull <- runFullRF(allCity %>% 
                            filter(year<2022, tt=="train", src!="Vegas") %>% 
                            mutate(fct_src=factor(src)), 
                        yVar="fct_src", 
                        xVars=varsTrain, 
                        dfTest=allCity %>% 
                            filter(year==2022, tt=="test", src!="Vegas") %>% 
                            mutate(fct_src=factor(src)), 
                        isContVar=FALSE, 
                        returnData=TRUE
                        )
## Warning: Dropped unused factor level(s) in dependent variable: Vegas.
## Growing trees.. Progress: 97%. Estimated remaining time: 0 seconds.

## 
## Accuracy of test data is: 100%

# Score every Las Vegas row with the all-variable city model and tally the predicted city
predictRF(rfCityFull$rf, df=allCity %>% filter(src=="Vegas")) %>% count(pred)
## # A tibble: 1 × 2
##   pred       n
##   <fct>  <int>
## 1 LA    122712
# Run without moisture variables
# Same city classifier, but every predictor whose name contains "moist" is dropped
# Vegas rows are filtered out BEFORE fct_src is rebuilt, so the response factor holds only the 
# cities actually present and ranger has no unused "Vegas" level to warn about and drop
rfCityNoMoisture <- runFullRF(allCity %>% 
                                  filter(year<2022, tt=="train", src!="Vegas") %>% 
                                  mutate(fct_src=factor(src)), 
                              yVar="fct_src", 
                              xVars=varsTrain[!grepl(pattern="moist", x=varsTrain)],
                              dfTest=allCity %>% 
                                  filter(year==2022, tt=="test", src!="Vegas") %>% 
                                  mutate(fct_src=factor(src)), 
                              isContVar=FALSE, 
                              returnData=TRUE
                              )
## Warning: Dropped unused factor level(s) in dependent variable: Vegas.
## Growing trees.. Progress: 65%. Estimated remaining time: 16 seconds.

## 
## Accuracy of test data is: 98.725%

# Score the Las Vegas rows with the no-moisture city model and tally the predicted city
# This previously scored houTemp (Houston, one of the training cities) instead of Las Vegas
# NOTE: the recorded output below and the Houston conclusion drawn from it came from that 
# earlier call and need to be regenerated
predictRF(rfCityNoMoisture$rf, df=allCity %>% filter(src=="Vegas")) %>% count(pred)
## # A tibble: 1 × 2
##   pred         n
##   <fct>    <int>
## 1 Houston 122712

The previously trained random forest models overwhelmingly predict Las Vegas as Los Angeles (if soil moisture is included) or Houston (if soil moisture is excluded)

The linear approximation for estimating temperature based on dewpoint and relative humidity is applied:

# Apply the dewpoint/RH linear model to the 2022 Las Vegas test rows, then average the actuals, 
# predictions, and errors within 5-degree buckets of actual temperature
ggMiniTempLAS <- allCity %>% 
    filter(src=="Vegas", tt=="test", year==2022) %>%
    select(temperature_2m, rh=relativehumidity_2m, d=dewpoint_2m) %>% 
    mutate(pred=predict(lmMiniTemp, newdata=data.frame(rh=rh, d=d)), 
           err=pred-temperature_2m, 
           err2=err^2, 
           rnd5=round(temperature_2m/5)*5
           ) %>% 
    # the predictors are dropped again so that only the original columns get averaged
    select(-rh, -d) %>% 
    group_by(rnd5) %>% 
    summarize(n=n(), across(.cols=where(is.numeric), .fns=mean))
ggMiniTempLAS
## # A tibble: 11 × 6
##     rnd5     n temperature_2m   pred     err     err2
##    <dbl> <dbl>          <dbl>  <dbl>   <dbl>    <dbl>
##  1    -5     1          -2.7  -2.95   -0.252   0.0633
##  2     0    48           1.09  0.469  -0.616   2.42  
##  3     5   264           5.52  3.97   -1.55    6.82  
##  4    10   406           9.96  6.39   -3.56   23.1   
##  5    15   345          14.7   8.61   -6.11   54.3   
##  6    20   294          20.1  11.4    -8.65   98.2   
##  7    25   370          25.2  16.8    -8.41  110.    
##  8    30   407          29.8  20.4    -9.37  142.    
##  9    35   274          34.8  22.6   -12.3   202.    
## 10    40   119          39.7  23.1   -16.6   298.    
## 11    45     9          43.5  22.8   -20.7   430.
# Overall MSE and RMSE, weighting each bucket's mean squared error by its row count
ggMiniTempLAS %>% 
    summarize(mse=sum(n*err2)/sum(n), rmse=sqrt(mse))
## # A tibble: 1 × 2
##     mse  rmse
##   <dbl> <dbl>
## 1  99.3  9.97
# Bucketed actual and predicted mean temperature against the bucketed actual temperature
# The dashed y=x line marks where a perfect prediction would fall
ggMiniTempLAS %>% 
    select(rnd5, temperature_2m, pred) %>%
    pivot_longer(cols=c(temperature_2m, pred)) %>%
    ggplot(aes(x=rnd5, y=value)) + 
    geom_line(aes(group=name, 
                  color=c("pred"="Predicted Mean", "temperature_2m"="Actual Mean")[name]
                  )
              ) + 
    geom_abline(slope=1, intercept=0, lty=2) + 
    scale_color_discrete("Metric") + 
    labs(title="Actual vs. Predicted Temperature Using Old City Linear Model on New City Data", 
         x="New city actual temperature (rounded to nearest 5)", 
         y="Average temperature for metric"
         )

The linear approximation based on dewpoint and relative humidity is inaccurate for predicting temperatures in Las Vegas, consistent with Las Vegas having T/D trends very different from those of the originally modeled cities, NYC and Chicago.

Las Vegas is meaningfully different from NYC and Chicago on key predictors:

# Re-tabulate by city at whole-number RH, dewpoint, and temperature, now that allCity 
# includes Las Vegas
tmpPlotData <- allCity %>% 
    transmute(src, 
              relativehumidity_2m=round(relativehumidity_2m), 
              dewpoint_2m=round(dewpoint_2m), 
              temperature_2m=round(temperature_2m)
              ) %>% 
    count(src, relativehumidity_2m, dewpoint_2m, temperature_2m)

# Dewpoint against temperature by city
# Cell counts (n) drive both the point size and the weights of the per-city linear fit
tmpPlotData %>%
    count(src, temperature_2m, dewpoint_2m, wt=n) %>%
    ggplot(aes(x=temperature_2m, y=dewpoint_2m, color=src)) + 
    geom_point(aes(size=n), alpha=0.2) + 
    geom_smooth(aes(weight=n), method="lm") +
    labs(title="T/D by city")
## `geom_smooth()` using formula = 'y ~ x'

# Relative humidity against temperature by city
# Cell counts (n) drive both the point size and the weights of the per-city linear fit
tmpPlotData %>%
    count(src, temperature_2m, relativehumidity_2m, wt=n) %>%
    ggplot(aes(x=temperature_2m, y=relativehumidity_2m, color=src)) + 
    geom_point(aes(size=n), alpha=0.1) + 
    geom_smooth(aes(weight=n), method="lm") +
    labs(title="T/RH by city")
## `geom_smooth()` using formula = 'y ~ x'

The existing random forest model, trained on NYC and Chicago, is also tested on Las Vegas temperatures:

# Temperature predictions for Vegas
# No training data is passed: the already-fitted forest in rfTemp2m$rf is supplied through 
# useExistingRF and scored on the 2022 Las Vegas test rows
# The recorded run reports R-squared and RMSE for these predictions
runFullRF(yVar="temperature_2m", 
          useExistingRF=rfTemp2m$rf, 
          dfTest=allCity %>% filter(tt=="test", year==2022, src=="Vegas"), 
          useLabel="Las Vegas temperature predictions", 
          useSub="Las Vegas", 
          isContVar=TRUE,
          rndTo=0.5, 
          refXY=TRUE
          )
## 
## R-squared of Las Vegas temperature predictions is: 90.29% (RMSE 3.32 vs. 10.65 null)
## `geom_smooth()` using formula = 'y ~ x'

The random forest is more accurate than the linear model in predicting temperatures in Las Vegas based on training data from other cities. RMSE is ~3 rather than the ~10 from the linear model application

All combinations of two variables are explored for predicting temperature on a smaller training dataset:

# Train and test data
# Las Vegas is excluded from both frames
# Training uses the pre-2022 rows flagged "train"; testing uses the 2022 rows flagged "test"
# fct_src is rebuilt after filtering, so its levels cover only the remaining cities
dfTrainTemp <- allCity %>% 
    filter(!(src %in% c("Vegas")), tt=="train", year<2022) %>% 
    mutate(fct_src=factor(src))
dfTestTemp <- allCity %>% 
    filter(!(src %in% c("Vegas")), tt=="test", year==2022) %>% 
    mutate(fct_src=factor(src))

# Variables to explore
# Drops every varsTrain entry that starts with "temp" or ends with "ature" (so the target and 
# other temperature fields are not used as predictors), then adds month and tod
possTempVars <- c(varsTrain[!str_detect(varsTrain, "^temp|ature$")], "month", "tod")

# Subsets to use
# A reproducible sample of 5,000 training rows keeps each of the many small forests fast to fit
set.seed(24070815)
idxSmallTemp <- sample(seq_len(nrow(dfTrainTemp)), 5000, replace=FALSE)

# Every unordered pair of candidate predictors, one pair per column (row 1 < row 2)
# combn() lists the pairs in the same order as nested idx1 < idx2 loops would visit them
idxPairs <- combn(length(possTempVars), 2)

# Result matrix is allocated once (columns: idx1, idx2, test-set R-squared) rather than 
# being grown with rbind() on every iteration
mtxSmallTemp <- matrix(NA_real_, nrow=ncol(idxPairs), ncol=3)

for(idxPair in seq_len(ncol(idxPairs))) {
    idx1 <- idxPairs[1, idxPair]
    idx2 <- idxPairs[2, idxPair]
    # Fit a two-predictor forest on the small training sample and keep only its test R-squared
    r2SmallTemp <- runFullRF(dfTrain=dfTrainTemp[idxSmallTemp,], 
                             yVar="temperature_2m", 
                             xVars=possTempVars[c(idx1, idx2)], 
                             dfTest=dfTestTemp, 
                             useLabel=keyLabel, 
                             useSub=stringr::str_to_sentence(keyLabel), 
                             isContVar=TRUE,
                             makePlots=FALSE,
                             returnData=TRUE
                             )[["rfAcc"]][["r2"]]
    mtxSmallTemp[idxPair, ] <- c(idx1, idx2, r2SmallTemp)
}
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.429% (RMSE 9.9 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 73.568% (RMSE 5.35 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 28.904% (RMSE 8.77 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 19.761% (RMSE 9.32 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.887% (RMSE 10.09 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.689% (RMSE 10.1 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.11% (RMSE 9.92 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.088% (RMSE 9.86 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 8.741% (RMSE 9.94 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.102% (RMSE 9.64 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.91% (RMSE 10.14 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.741% (RMSE 8.9 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 22.008% (RMSE 9.19 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.981% (RMSE 9.65 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 19.124% (RMSE 9.35 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 7.458% (RMSE 10.01 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 7.166% (RMSE 10.02 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.926% (RMSE 10.03 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 7.027% (RMSE 10.03 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 7.149% (RMSE 10.02 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 40.99% (RMSE 7.99 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.581% (RMSE 9.67 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.066% (RMSE 7.57 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 93.913% (RMSE 2.57 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 88.307% (RMSE 3.56 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 73.703% (RMSE 5.33 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 34.437% (RMSE 8.42 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.273% (RMSE 9.74 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.693% (RMSE 9.72 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 8.371% (RMSE 9.96 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.175% (RMSE 10.13 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.815% (RMSE 10.09 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 55.027% (RMSE 6.98 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.472% (RMSE 6.86 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.346% (RMSE 10.07 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.67% (RMSE 0.6 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 25.088% (RMSE 9 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.674% (RMSE 9.61 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 2.583% (RMSE 10.27 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.53% (RMSE 10.16 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.545% (RMSE 10.05 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.577% (RMSE 10.11 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.095% (RMSE 10.08 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.496% (RMSE 9.84 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 1.226% (RMSE 10.34 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.277% (RMSE 9.52 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.385% (RMSE 9.57 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.7% (RMSE 9.72 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.058% (RMSE 9.64 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -3.35% (RMSE 10.57 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 0.799% (RMSE 10.36 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 0.856% (RMSE 10.36 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 1.208% (RMSE 10.34 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -4.26% (RMSE 10.62 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 37.901% (RMSE 8.2 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.266% (RMSE 9.85 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 98.932% (RMSE 1.07 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 93.996% (RMSE 2.55 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 83.519% (RMSE 4.22 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 69.807% (RMSE 5.72 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.043% (RMSE 8.94 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.491% (RMSE 10.06 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 8.247% (RMSE 9.96 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.563% (RMSE 10.21 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -8.445% (RMSE 10.83 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 0.944% (RMSE 10.35 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.033% (RMSE 6.9 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 58.427% (RMSE 6.71 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 7.112% (RMSE 10.02 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 65.703% (RMSE 6.09 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 74.493% (RMSE 5.25 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 67.963% (RMSE 5.89 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 62.674% (RMSE 6.35 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 59.516% (RMSE 6.62 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 72.282% (RMSE 5.48 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 71.68% (RMSE 5.53 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 69.17% (RMSE 5.78 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 64.591% (RMSE 6.19 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 77.957% (RMSE 4.88 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 79.145% (RMSE 4.75 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 78.794% (RMSE 4.79 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 73.678% (RMSE 5.34 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 65.79% (RMSE 6.08 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 65.682% (RMSE 6.09 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 67.451% (RMSE 5.93 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 67.143% (RMSE 5.96 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 66.739% (RMSE 6 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 88.916% (RMSE 3.46 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 72.176% (RMSE 5.49 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.885% (RMSE 0.35 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 95.287% (RMSE 2.26 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 84.311% (RMSE 4.12 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 77.826% (RMSE 4.9 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 69.575% (RMSE 5.74 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 75.127% (RMSE 5.19 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 72.572% (RMSE 5.45 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 60.311% (RMSE 6.55 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 72.894% (RMSE 5.42 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 64.081% (RMSE 6.23 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 69.822% (RMSE 5.71 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 71.887% (RMSE 5.51 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 62.518% (RMSE 6.37 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 78.956% (RMSE 4.77 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 25.745% (RMSE 8.96 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 22.842% (RMSE 9.14 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.151% (RMSE 8.94 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.112% (RMSE 8.88 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 25.85% (RMSE 8.96 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 30.306% (RMSE 8.68 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 22.438% (RMSE 9.16 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 35.883% (RMSE 8.33 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 36.09% (RMSE 8.31 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.366% (RMSE 8.49 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.024% (RMSE 8.64 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 22.767% (RMSE 9.14 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 23.717% (RMSE 9.08 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 20.207% (RMSE 9.29 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 19.618% (RMSE 9.33 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 20.859% (RMSE 9.25 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.139% (RMSE 7.63 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.623% (RMSE 8.6 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.748% (RMSE 6.84 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 93.979% (RMSE 2.55 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 80.88% (RMSE 4.55 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 70.025% (RMSE 5.69 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.042% (RMSE 7.85 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 23.374% (RMSE 9.1 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 24.767% (RMSE 9.02 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 23.334% (RMSE 9.11 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 20.753% (RMSE 9.26 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 24.253% (RMSE 9.05 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 51.238% (RMSE 7.26 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 55.056% (RMSE 6.97 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.286% (RMSE 8.93 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.817% (RMSE 9.6 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.3% (RMSE 9.74 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.245% (RMSE 9.58 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.25% (RMSE 9.46 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.46% (RMSE 9.45 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 20.289% (RMSE 9.29 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.55% (RMSE 9.78 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.526% (RMSE 8.92 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.502% (RMSE 8.92 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 23.704% (RMSE 9.08 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 23.106% (RMSE 9.12 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.484% (RMSE 9.84 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.298% (RMSE 9.85 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.56% (RMSE 9.84 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.027% (RMSE 9.87 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.633% (RMSE 9.89 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 40.008% (RMSE 8.06 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 22.242% (RMSE 9.17 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 55.477% (RMSE 6.94 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 94.082% (RMSE 2.53 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 80.986% (RMSE 4.54 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 68.676% (RMSE 5.82 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 36.736% (RMSE 8.27 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 18.046% (RMSE 9.42 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 20.806% (RMSE 9.26 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.218% (RMSE 9.46 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.571% (RMSE 9.84 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.86% (RMSE 9.65 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.211% (RMSE 6.88 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 58.768% (RMSE 6.68 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.445% (RMSE 9.56 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.088% (RMSE 10.24 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.043% (RMSE 10.19 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.587% (RMSE 10.05 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.672% (RMSE 10.21 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.859% (RMSE 9.87 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 0.948% (RMSE 10.35 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.057% (RMSE 9.47 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.875% (RMSE 9.48 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.887% (RMSE 9.71 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.131% (RMSE 9.75 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 2.397% (RMSE 10.28 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.15% (RMSE 10.13 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.72% (RMSE 10.1 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.77% (RMSE 10.1 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -0.215% (RMSE 10.41 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 30.147% (RMSE 8.69 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.585% (RMSE 10.05 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.717% (RMSE 7.59 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 92.485% (RMSE 2.85 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 79.222% (RMSE 4.74 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 66.135% (RMSE 6.05 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.954% (RMSE 8.7 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 8.324% (RMSE 9.96 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 7.715% (RMSE 9.99 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.122% (RMSE 10.24 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.423% (RMSE 10.11 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -0.41% (RMSE 10.42 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.009% (RMSE 7.5 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.983% (RMSE 7.5 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.19% (RMSE 10.23 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.913% (RMSE 10.2 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.521% (RMSE 10.06 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.228% (RMSE 10.18 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.842% (RMSE 9.88 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 1.242% (RMSE 10.34 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.786% (RMSE 9.54 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.899% (RMSE 9.54 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.622% (RMSE 9.72 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.202% (RMSE 9.8 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.261% (RMSE 10.23 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.787% (RMSE 10.1 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.425% (RMSE 10.06 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.516% (RMSE 10.06 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 0.949% (RMSE 10.35 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.915% (RMSE 8.83 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.8% (RMSE 10.09 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.522% (RMSE 7.82 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 82.046% (RMSE 4.41 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 71.623% (RMSE 5.54 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 59.81% (RMSE 6.59 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 28.356% (RMSE 8.8 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 8.174% (RMSE 9.97 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.885% (RMSE 10.04 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 2.068% (RMSE 10.29 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.175% (RMSE 10.07 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -0.351% (RMSE 10.42 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.321% (RMSE 7.83 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 41.513% (RMSE 7.95 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.181% (RMSE 10.23 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 7.207% (RMSE 10.02 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.276% (RMSE 10.07 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.164% (RMSE 9.86 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.696% (RMSE 10.15 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.841% (RMSE 9.43 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.143% (RMSE 9.47 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.715% (RMSE 9.66 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.786% (RMSE 9.6 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.242% (RMSE 10.07 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 8.538% (RMSE 9.95 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.678% (RMSE 9.88 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.324% (RMSE 9.9 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.554% (RMSE 10.16 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.586% (RMSE 8.85 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.349% (RMSE 10.12 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 40.067% (RMSE 8.05 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 80.189% (RMSE 4.63 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 69.093% (RMSE 5.78 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 58.81% (RMSE 6.68 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.071% (RMSE 8.76 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.24% (RMSE 9.85 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.185% (RMSE 9.91 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.289% (RMSE 10.12 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 8.112% (RMSE 9.97 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.54% (RMSE 10.22 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 41.685% (RMSE 7.94 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 40.754% (RMSE 8.01 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.191% (RMSE 10.07 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.731% (RMSE 10.1 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 8.638% (RMSE 9.94 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.92% (RMSE 10.14 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.027% (RMSE 9.47 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.557% (RMSE 9.5 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.768% (RMSE 9.71 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.424% (RMSE 9.57 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 0.623% (RMSE 10.37 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.59% (RMSE 10.21 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.906% (RMSE 10.14 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.942% (RMSE 10.14 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 0.691% (RMSE 10.36 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 30.532% (RMSE 8.67 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.065% (RMSE 9.86 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.031% (RMSE 7.57 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 93.521% (RMSE 2.65 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 80.598% (RMSE 4.58 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 67.12% (RMSE 5.96 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 30.263% (RMSE 8.69 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.47% (RMSE 9.84 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.16% (RMSE 9.75 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 8.485% (RMSE 9.95 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 1.039% (RMSE 10.35 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.357% (RMSE 10.17 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.638% (RMSE 7.53 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 50.487% (RMSE 7.32 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 8.102% (RMSE 9.97 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.189% (RMSE 9.86 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.59% (RMSE 10.21 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.264% (RMSE 9.52 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.97% (RMSE 9.53 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.181% (RMSE 9.75 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.948% (RMSE 9.65 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 2.675% (RMSE 10.26 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.074% (RMSE 10.13 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.891% (RMSE 10.09 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.19% (RMSE 10.07 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 0.818% (RMSE 10.36 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 30.005% (RMSE 8.7 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.852% (RMSE 9.88 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.358% (RMSE 7.47 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 92.347% (RMSE 2.88 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 80.012% (RMSE 4.65 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 65.953% (RMSE 6.07 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 30.794% (RMSE 8.65 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.705% (RMSE 9.77 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.2% (RMSE 9.75 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 8.392% (RMSE 9.95 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.316% (RMSE 10.07 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.699% (RMSE 10.21 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.662% (RMSE 7.52 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.94% (RMSE 7.43 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.523% (RMSE 10.06 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 7.773% (RMSE 9.99 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 21.936% (RMSE 9.19 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 21.686% (RMSE 9.2 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.661% (RMSE 9.44 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 18.668% (RMSE 9.38 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 8.026% (RMSE 9.97 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.733% (RMSE 9.88 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.329% (RMSE 9.79 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.428% (RMSE 9.79 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 7.042% (RMSE 10.03 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.997% (RMSE 8.45 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.921% (RMSE 9.65 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.934% (RMSE 7.5 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 91.474% (RMSE 3.04 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 79.676% (RMSE 4.69 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 67.358% (RMSE 5.94 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.489% (RMSE 8.48 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.825% (RMSE 9.6 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.482% (RMSE 9.62 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.046% (RMSE 9.86 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 8.669% (RMSE 9.94 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 8.469% (RMSE 9.95 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 49.054% (RMSE 7.42 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 50.255% (RMSE 7.34 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.485% (RMSE 9.79 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.672% (RMSE 9.55 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.964% (RMSE 9.59 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.018% (RMSE 9.81 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.516% (RMSE 9.78 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 0.288% (RMSE 10.39 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 2.913% (RMSE 10.25 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.019% (RMSE 10.19 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.434% (RMSE 10.17 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -2.001% (RMSE 10.5 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 28.883% (RMSE 8.77 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 7.572% (RMSE 10 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.916% (RMSE 7.86 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 88.409% (RMSE 3.54 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 77.112% (RMSE 4.98 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 64.798% (RMSE 6.17 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 28.508% (RMSE 8.79 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.756% (RMSE 10.1 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.603% (RMSE 10.11 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 1.627% (RMSE 10.32 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 0.687% (RMSE 10.37 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -0.273% (RMSE 10.42 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.735% (RMSE 7.73 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.03% (RMSE 7.64 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.086% (RMSE 10.19 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.677% (RMSE 9.72 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 19.621% (RMSE 9.32 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.785% (RMSE 9.66 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.877% (RMSE 9.48 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.718% (RMSE 9.61 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.868% (RMSE 9.6 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.49% (RMSE 9.56 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.479% (RMSE 9.56 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.869% (RMSE 7.58 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 21.992% (RMSE 9.19 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.596% (RMSE 7.6 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 93.366% (RMSE 2.68 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 87.954% (RMSE 3.61 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 75.429% (RMSE 5.16 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.403% (RMSE 7.69 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 19.699% (RMSE 9.32 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 20.297% (RMSE 9.29 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.315% (RMSE 9.51 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.759% (RMSE 9.55 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.278% (RMSE 9.63 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.03% (RMSE 6.9 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.971% (RMSE 6.82 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.862% (RMSE 9.43 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 21.844% (RMSE 9.2 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.169% (RMSE 9.64 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.689% (RMSE 9.61 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.163% (RMSE 9.64 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.585% (RMSE 9.61 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.932% (RMSE 9.59 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.104% (RMSE 9.58 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 41.787% (RMSE 7.94 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 21.386% (RMSE 9.22 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.94% (RMSE 7.5 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 93.855% (RMSE 2.58 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 88.061% (RMSE 3.59 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 75.203% (RMSE 5.18 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.167% (RMSE 7.77 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 18.851% (RMSE 9.37 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 19.851% (RMSE 9.31 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.322% (RMSE 9.46 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.277% (RMSE 9.69 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.266% (RMSE 9.69 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 55.637% (RMSE 6.93 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.908% (RMSE 6.83 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.073% (RMSE 9.47 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.719% (RMSE 9.66 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.731% (RMSE 9.88 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.305% (RMSE 9.85 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.25% (RMSE 9.85 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.522% (RMSE 9.78 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.422% (RMSE 9.84 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 38.267% (RMSE 8.17 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.695% (RMSE 9.44 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 49.747% (RMSE 7.37 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 93.935% (RMSE 2.56 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 87.475% (RMSE 3.68 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 73.738% (RMSE 5.33 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 38.936% (RMSE 8.13 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.704% (RMSE 9.55 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 17.667% (RMSE 9.44 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.314% (RMSE 9.68 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.882% (RMSE 9.82 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.244% (RMSE 9.8 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.438% (RMSE 7.02 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.275% (RMSE 6.88 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.786% (RMSE 9.71 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.431% (RMSE 9.73 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.527% (RMSE 9.78 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.667% (RMSE 9.78 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.917% (RMSE 9.76 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.841% (RMSE 9.82 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.882% (RMSE 8.46 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 19.762% (RMSE 9.32 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 45.646% (RMSE 7.67 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 93.171% (RMSE 2.72 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 86.649% (RMSE 3.8 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 73.192% (RMSE 5.39 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 41.825% (RMSE 7.93 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.773% (RMSE 9.55 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.767% (RMSE 9.49 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 12.47% (RMSE 9.73 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.618% (RMSE 9.67 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.914% (RMSE 9.82 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 52.493% (RMSE 7.17 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.696% (RMSE 7 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.165% (RMSE 9.64 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -0.291% (RMSE 10.42 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -1.954% (RMSE 10.5 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 1.332% (RMSE 10.33 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 1.1% (RMSE 10.34 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 33.903% (RMSE 8.46 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 8.123% (RMSE 9.97 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.542% (RMSE 7.6 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 93.665% (RMSE 2.62 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 80.68% (RMSE 4.57 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 65.732% (RMSE 6.09 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 23.843% (RMSE 9.08 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.725% (RMSE 10.21 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.505% (RMSE 10.16 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 0.44% (RMSE 10.38 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -10.633% (RMSE 10.94 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 1.062% (RMSE 10.35 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.063% (RMSE 7.5 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 51.964% (RMSE 7.21 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 7.182% (RMSE 10.02 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.131% (RMSE 10.24 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 2.786% (RMSE 10.26 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 2.983% (RMSE 10.24 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.804% (RMSE 8.59 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.402% (RMSE 9.79 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 44.643% (RMSE 7.74 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 93.705% (RMSE 2.61 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 80.142% (RMSE 4.63 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 65.855% (RMSE 6.08 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 25.35% (RMSE 8.99 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.672% (RMSE 10.16 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.579% (RMSE 10.16 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 2.524% (RMSE 10.27 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -9.096% (RMSE 10.86 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.885% (RMSE 10.2 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.1% (RMSE 7.56 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 51.588% (RMSE 7.24 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 8.393% (RMSE 9.95 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -3.371% (RMSE 10.57 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -1.714% (RMSE 10.49 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 28.888% (RMSE 8.77 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.623% (RMSE 9.78 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.561% (RMSE 7.81 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 93.832% (RMSE 2.58 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 81.837% (RMSE 4.43 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 68.57% (RMSE 5.83 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.745% (RMSE 8.72 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.956% (RMSE 10.09 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.741% (RMSE 10.1 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 0.074% (RMSE 10.4 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 1.185% (RMSE 10.34 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.327% (RMSE 10.17 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 50.74% (RMSE 7.3 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 53.287% (RMSE 7.11 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 8.992% (RMSE 9.92 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -0.754% (RMSE 10.44 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 28.618% (RMSE 8.79 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 11.681% (RMSE 9.77 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.891% (RMSE 7.79 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 93.861% (RMSE 2.58 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 82.013% (RMSE 4.41 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 68.121% (RMSE 5.87 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.831% (RMSE 8.71 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 7.142% (RMSE 10.02 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.857% (RMSE 10.04 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -0.315% (RMSE 10.42 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 0.64% (RMSE 10.37 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.713% (RMSE 10.15 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 49.573% (RMSE 7.39 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 52.879% (RMSE 7.14 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 8.715% (RMSE 9.94 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 32.923% (RMSE 8.52 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 7.109% (RMSE 10.02 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.701% (RMSE 7.59 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 93.65% (RMSE 2.62 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 81.967% (RMSE 4.42 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 67.596% (RMSE 5.92 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.377% (RMSE 8.86 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 0.408% (RMSE 10.38 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.302% (RMSE 10.23 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 0.066% (RMSE 10.4 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -8.751% (RMSE 10.85 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -0.995% (RMSE 10.45 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.51% (RMSE 7.61 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 51.323% (RMSE 7.26 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 5.276% (RMSE 10.12 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 34.362% (RMSE 8.43 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.634% (RMSE 7.01 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 94.115% (RMSE 2.52 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 89.72% (RMSE 3.33 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 78.969% (RMSE 4.77 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 52.809% (RMSE 7.14 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 32.763% (RMSE 8.53 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 34.305% (RMSE 8.43 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 30.562% (RMSE 8.67 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.478% (RMSE 8.73 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 28.885% (RMSE 8.77 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 65.747% (RMSE 6.09 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 66.572% (RMSE 6.01 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.948% (RMSE 8.71 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 50.827% (RMSE 7.29 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 92.636% (RMSE 2.82 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 80.757% (RMSE 4.56 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 67.744% (RMSE 5.91 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 32.76% (RMSE 8.53 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 14.877% (RMSE 9.6 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 15.606% (RMSE 9.55 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 13.554% (RMSE 9.67 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.801% (RMSE 9.88 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 6.244% (RMSE 10.07 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 50.584% (RMSE 7.31 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 50.405% (RMSE 7.32 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 8.579% (RMSE 9.94 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 95.146% (RMSE 2.29 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 89.291% (RMSE 3.4 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 79.955% (RMSE 4.66 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 60.074% (RMSE 6.57 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.112% (RMSE 7.49 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 50.738% (RMSE 7.3 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.973% (RMSE 7.57 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 43.898% (RMSE 7.79 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 41.974% (RMSE 7.92 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 71.973% (RMSE 5.51 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 72.739% (RMSE 5.43 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 41.766% (RMSE 7.94 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 94.139% (RMSE 2.52 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 93.978% (RMSE 2.55 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 93.809% (RMSE 2.59 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 94.028% (RMSE 2.54 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 93.726% (RMSE 2.61 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 93.291% (RMSE 2.69 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 93.555% (RMSE 2.64 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 91.044% (RMSE 3.11 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 93.807% (RMSE 2.59 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 93.102% (RMSE 2.73 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 80.584% (RMSE 4.58 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 80.924% (RMSE 4.54 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 80.24% (RMSE 4.62 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 79.578% (RMSE 4.7 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 79.431% (RMSE 4.72 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 79.301% (RMSE 4.73 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 78.716% (RMSE 4.8 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 79.109% (RMSE 4.75 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 79.81% (RMSE 4.67 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 80.973% (RMSE 4.54 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 75.944% (RMSE 5.1 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 70.843% (RMSE 5.62 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 65.566% (RMSE 6.1 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 64.709% (RMSE 6.18 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 61.436% (RMSE 6.46 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 62.479% (RMSE 6.37 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 65.558% (RMSE 6.1 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 70.774% (RMSE 5.62 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 73.489% (RMSE 5.36 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 60.888% (RMSE 6.5 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 32.533% (RMSE 8.54 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.923% (RMSE 8.83 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 22.66% (RMSE 9.15 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 16.347% (RMSE 9.51 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 27.933% (RMSE 8.83 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 69.27% (RMSE 5.77 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 71.425% (RMSE 5.56 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 31.27% (RMSE 8.62 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 24.58% (RMSE 9.03 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 18.775% (RMSE 9.37 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 29.102% (RMSE 8.76 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.178% (RMSE 9.86 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 52.943% (RMSE 7.13 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 56.017% (RMSE 6.9 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.833% (RMSE 9.82 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.51% (RMSE 9.89 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 32.762% (RMSE 8.53 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 10.111% (RMSE 9.86 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 50.187% (RMSE 7.34 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 54.482% (RMSE 7.02 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.489% (RMSE 9.9 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 26.092% (RMSE 8.94 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -4.979% (RMSE 10.66 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 40.538% (RMSE 8.02 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 48.519% (RMSE 7.46 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 4.702% (RMSE 10.15 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: -3.641% (RMSE 10.59 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 66.967% (RMSE 5.98 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 66.189% (RMSE 6.05 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 9.567% (RMSE 9.89 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 46.239% (RMSE 7.63 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 47.407% (RMSE 7.54 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 3.068% (RMSE 10.24 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 51.415% (RMSE 7.25 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.973% (RMSE 7.85 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 42.357% (RMSE 7.9 vs. 10.4 null)

Predictive success by metric is explored:

# Convert the results matrix to a tibble: name its three columns and look up
# the predictor names in possTempVars from the two index columns
dfSmallR2Temp <- tibble::as_tibble(
    stats::setNames(as.data.frame(mtxSmallTemp), c("idx1", "idx2", "r2"))
) %>%
    mutate(
        var1=possTempVars[idx1],
        var2=possTempVars[idx2],
        rn=row_number()
    )

# Top 20 predictor pairs by R-squared
dfSmallR2Temp %>%
    select(var1, var2, r2) %>%
    arrange(desc(r2)) %>%
    print(n=20)
## # A tibble: 630 × 3
##    var1                       var2                             r2
##    <chr>                      <chr>                         <dbl>
##  1 dewpoint_2m                vapor_pressure_deficit        0.999
##  2 relativehumidity_2m        dewpoint_2m                   0.997
##  3 relativehumidity_2m        vapor_pressure_deficit        0.989
##  4 dewpoint_2m                soil_temperature_0_to_7cm     0.953
##  5 vapor_pressure_deficit     soil_temperature_0_to_7cm     0.951
##  6 soil_temperature_0_to_7cm  soil_temperature_7_to_28cm    0.941
##  7 et0_fao_evapotranspiration soil_temperature_0_to_7cm     0.941
##  8 surface_pressure           soil_temperature_0_to_7cm     0.941
##  9 soil_temperature_0_to_7cm  soil_moisture_0_to_7cm        0.940
## 10 relativehumidity_2m        soil_temperature_0_to_7cm     0.940
## 11 pressure_msl               soil_temperature_0_to_7cm     0.940
## 12 soil_temperature_0_to_7cm  soil_temperature_28_to_100cm  0.940
## 13 direct_normal_irradiance   soil_temperature_0_to_7cm     0.939
## 14 hour                       soil_temperature_0_to_7cm     0.939
## 15 winddirection_100m         soil_temperature_0_to_7cm     0.939
## 16 direct_radiation           soil_temperature_0_to_7cm     0.939
## 17 winddirection_10m          soil_temperature_0_to_7cm     0.938
## 18 soil_temperature_0_to_7cm  soil_temperature_100_to_255cm 0.938
## 19 soil_temperature_0_to_7cm  doy                           0.938
## 20 soil_temperature_0_to_7cm  soil_moisture_7_to_28cm       0.937
## # ℹ 610 more rows
# For each predictor, summarize R-squared (min, mean, max) over every pair that
# contains it, then plot the range with predictors ordered by mean R-squared
dfSmallR2Temp %>%
    pivot_longer(cols=c(var1, var2)) %>%
    group_by(value) %>%
    summarize(r2_min=min(r2), r2_mu=mean(r2), r2_max=max(r2)) %>%
    ggplot(aes(x=fct_reorder(value, r2_mu))) +
    geom_point(aes(y=r2_mu)) +
    geom_errorbar(aes(ymin=r2_min, ymax=r2_max)) +
    # Dashed red reference line at R-squared of 1
    geom_hline(yintercept=1, lty=2, color="red") +
    coord_flip() +
    lims(y=c(NA, 1)) +
    labs(
        x=NULL,
        y="Range of R-squared (min-mean-max)",
        title="R-squared in every 2-predictor model including self and one other",
        subtitle="Predicting temperature"
    )

# Top 20 predictor pairs by R-squared, excluding any pair that contains
# soil_temperature_0_to_7cm
dfSmallR2Temp %>%
    filter(!(var1=="soil_temperature_0_to_7cm" | var2=="soil_temperature_0_to_7cm")) %>%
    select(var1, var2, r2) %>%
    arrange(desc(r2)) %>%
    print(n=20)
## # A tibble: 595 × 3
##    var1                       var2                            r2
##    <chr>                      <chr>                        <dbl>
##  1 dewpoint_2m                vapor_pressure_deficit       0.999
##  2 relativehumidity_2m        dewpoint_2m                  0.997
##  3 relativehumidity_2m        vapor_pressure_deficit       0.989
##  4 et0_fao_evapotranspiration soil_temperature_7_to_28cm   0.897
##  5 vapor_pressure_deficit     soil_temperature_7_to_28cm   0.893
##  6 dewpoint_2m                et0_fao_evapotranspiration   0.889
##  7 hour                       soil_temperature_7_to_28cm   0.883
##  8 direct_radiation           soil_temperature_7_to_28cm   0.881
##  9 shortwave_radiation        soil_temperature_7_to_28cm   0.880
## 10 direct_normal_irradiance   soil_temperature_7_to_28cm   0.875
## 11 diffuse_radiation          soil_temperature_7_to_28cm   0.866
## 12 dewpoint_2m                soil_temperature_7_to_28cm   0.843
## 13 relativehumidity_2m        soil_temperature_7_to_28cm   0.835
## 14 winddirection_100m         soil_temperature_7_to_28cm   0.820
## 15 windgusts_10m              soil_temperature_7_to_28cm   0.820
## 16 winddirection_10m          soil_temperature_7_to_28cm   0.818
## 17 surface_pressure           soil_temperature_7_to_28cm   0.810
## 18 soil_temperature_7_to_28cm month                        0.810
## 19 soil_temperature_7_to_28cm soil_temperature_28_to_100cm 0.809
## 20 pressure_msl               soil_temperature_7_to_28cm   0.809
## # ℹ 575 more rows

Select combinations are explored using the full training dataset:

# Candidate predictors for the full-training-data runs
possLargeVars <- c(
    "dewpoint_2m", "vapor_pressure_deficit",
    "relativehumidity_2m", "soil_temperature_0_to_7cm"
)
possLargeVars
## [1] "dewpoint_2m"               "vapor_pressure_deficit"   
## [3] "relativehumidity_2m"       "soil_temperature_0_to_7cm"
# Fit one random forest per unordered pair of candidate predictors, using the
# full training data, and record (idx1, idx2, holdout R-squared) for each pair.
# The result matrix is preallocated rather than grown with rbind() in the loop,
# and pairs come from combn() so that fewer than two candidates fails loudly
# instead of silently iterating over a descending 1:0 style sequence.
stopifnot(length(possLargeVars) >= 2)

# One column per pair (idx1 < idx2), in the same order as nested loops over idx1 then idx2
mtxPairs <- utils::combn(length(possLargeVars), 2)
mtxLarge <- matrix(NA_real_, nrow=ncol(mtxPairs), ncol=3)

for (idxPair in seq_len(ncol(mtxPairs))) {
    idx1 <- mtxPairs[1, idxPair]
    idx2 <- mtxPairs[2, idxPair]
    # Only the holdout R-squared is kept from the list returned by runFullRF
    r2LargeTemp <- runFullRF(dfTrain=dfTrainTemp, 
                             yVar="temperature_2m", 
                             xVars=possLargeVars[c(idx1, idx2)], 
                             dfTest=dfTestTemp, 
                             useLabel=keyLabel, 
                             useSub=stringr::str_to_sentence(keyLabel), 
                             isContVar=TRUE,
                             makePlots=FALSE,
                             returnData=TRUE
                             )[["rfAcc"]][["r2"]]
    mtxLarge[idxPair, ] <- c(idx1, idx2, r2LargeTemp)
}
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.964% (RMSE 0.2 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.902% (RMSE 0.33 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 95.675% (RMSE 2.16 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 99.177% (RMSE 0.94 vs. 10.4 null)
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 95.59% (RMSE 2.18 vs. 10.4 null)
## Growing trees.. Progress: 91%. Estimated remaining time: 2 seconds.
## 
## R-squared of predictions based on pre-2022 training data applied to 2022 holdout dataset is: 94.588% (RMSE 2.42 vs. 10.4 null)
# Convert the results matrix to a tibble: name its three columns and look up
# the predictor names in possLargeVars from the two index columns
dfLargeR2Temp <- tibble::as_tibble(
    stats::setNames(as.data.frame(mtxLarge), c("idx1", "idx2", "r2"))
) %>%
    mutate(
        var1=possLargeVars[idx1],
        var2=possLargeVars[idx2],
        rn=row_number()
    )

# Predictor pairs ranked by R-squared (up to 20 rows shown)
dfLargeR2Temp %>%
    select(var1, var2, r2) %>%
    arrange(desc(r2)) %>%
    print(n=20)
## # A tibble: 6 × 3
##   var1                   var2                         r2
##   <chr>                  <chr>                     <dbl>
## 1 dewpoint_2m            vapor_pressure_deficit    1.00 
## 2 dewpoint_2m            relativehumidity_2m       0.999
## 3 vapor_pressure_deficit relativehumidity_2m       0.992
## 4 dewpoint_2m            soil_temperature_0_to_7cm 0.957
## 5 vapor_pressure_deficit soil_temperature_0_to_7cm 0.956
## 6 relativehumidity_2m    soil_temperature_0_to_7cm 0.946

A model using only dewpoint and vapor pressure deficit is trained on one city (NYC), then applied to the 2022 holdout data for every city:

# Train and test data
# Training rows: NYC only, flagged "train", years before 2022
dfTrainTemp_v2 <- allCity %>%
    filter(tt=="train", year<2022, src %in% c("NYC")) %>%
    mutate(fct_src=factor(src))
# Test rows: every city, flagged "test", year 2022
dfTestTemp_v2 <- allCity %>%
    filter(year==2022, tt=="test") %>%
    mutate(fct_src=factor(src))

# Random forest for temperature using dewpoint and vapor pressure deficit
keyLabel <- "predictions based on NYC pre-2022 training data applied to each city in 2022 holdout dataset"

# Keep only the test-set predictions, then compute accuracy separately by city:
#   tss = total sum of squares around the city mean, rss = residual sum of squares,
#   r2 = 1 - rss/tss, rmse = model RMSE, berr = RMSE of predicting the city mean
tmpPred_v2 <- runFullRF(
    dfTrain=dfTrainTemp_v2,
    dfTest=dfTestTemp_v2,
    yVar="temperature_2m",
    xVars=c("dewpoint_2m", "vapor_pressure_deficit"),
    useLabel=keyLabel,
    useSub=stringr::str_to_sentence(keyLabel),
    isContVar=TRUE,
    makePlots=FALSE,
    returnData=TRUE
)[["tstPred"]] %>%
    select(src, temperature_2m, pred) %>%
    group_by(src) %>%
    summarize(
        n=n(),
        tss=sum((temperature_2m - mean(temperature_2m))^2),
        rss=sum((temperature_2m - pred)^2),
        r2=1 - rss/tss,
        rmse=sqrt(rss/n),
        berr=sqrt(tss/n)
    )
## 
## R-squared of predictions based on NYC pre-2022 training data applied to each city in 2022 holdout dataset is: 94.65% (RMSE 2.47 vs. 10.69 null)
# Per-city holdout accuracy (n, tss, rss, r2, rmse, berr) for the NYC-trained model
tmpPred_v2
## # A tibble: 5 × 7
##   src         n     tss    rss    r2  rmse  berr
##   <chr>   <int>   <dbl>  <dbl> <dbl> <dbl> <dbl>
## 1 Chicago  2592 356174.   305. 0.999 0.343 11.7 
## 2 Houston  2659 194789.   400. 0.998 0.388  8.56
## 3 LA       2677 127962.  6718. 0.947 1.58   6.91
## 4 NYC      2664 280171.   104. 1.00  0.197 10.3 
## 5 Vegas    2537 287670. 72697. 0.747 5.35  10.6

The model trained on NYC performs very well on Chicago and Houston (R-squared above 0.99), reasonably well on LA (R-squared of about 0.95), and misses significantly on Las Vegas (R-squared of about 0.75).

Patterns in dewpoint and vapor pressure deficit are explored:

# Bin both predictors to the nearest 0.5 in the test data and count
# observations per city and bin
dfPlot_v2 <- dfTestTemp_v2 %>%
    select(src, vapor_pressure_deficit, dewpoint_2m) %>%
    mutate(across(where(is.numeric), .fns=~ round(.x * 2) / 2)) %>%
    count(src, vapor_pressure_deficit, dewpoint_2m)

# One panel per city; point size reflects the number of observations in the bin
dfPlot_v2 %>%
    ggplot(aes(x=dewpoint_2m, y=vapor_pressure_deficit)) +
    geom_point(aes(color=src, size=n), alpha=0.25) +
    facet_wrap(~src) +
    scale_color_discrete(NULL)

# Overlap of NYC points by city
# Bin the NYC training predictors to the nearest 0.5 and flag the bins that
# hold at least 10 training observations
tmpNYC <- dfTrainTemp_v2 %>%
    select(src, vapor_pressure_deficit, dewpoint_2m) %>%
    mutate(across(where(is.numeric), .fns=~ round(.x * 2) / 2)) %>%
    count(src, vapor_pressure_deficit, dewpoint_2m) %>%
    filter(n>=10, src=="NYC") %>%
    mutate(inNYC=TRUE)

# Flag each test-data bin by whether NYC training has 10+ observations in the
# same bin (bins with no match in tmpNYC become FALSE), then plot by city
dfPlot_v2 %>%
    left_join(
        tmpNYC %>% select(vapor_pressure_deficit, dewpoint_2m, inNYC),
        by=c("vapor_pressure_deficit", "dewpoint_2m")
    ) %>%
    mutate(inNYC=coalesce(inNYC, FALSE)) %>%
    ggplot(aes(x=dewpoint_2m, y=vapor_pressure_deficit)) +
    geom_point(aes(color=inNYC, size=n), alpha=0.25) +
    facet_wrap(~src) +
    scale_color_discrete("NYC training\nhas 10+ obs")

# Per city: share of test observations falling in bins where NYC training has
# 10+ observations (meanNYC), total observations (n), and number of bins (nObs)
dfPlot_v2 %>%
    left_join(
        tmpNYC %>% select(vapor_pressure_deficit, dewpoint_2m, inNYC),
        by=c("vapor_pressure_deficit", "dewpoint_2m")
    ) %>%
    mutate(inNYC=coalesce(inNYC, FALSE)) %>%
    group_by(src) %>%
    summarize(
        meanNYC=sum(inNYC * n) / sum(n),
        n=sum(n),
        # dplyr::n() is spelled out because a column named n also exists here
        nObs=dplyr::n()
    )
## # A tibble: 5 × 4
##   src     meanNYC     n  nObs
##   <chr>     <dbl> <int> <int>
## 1 Chicago   0.988  2592   335
## 2 Houston   0.936  2659   371
## 3 LA        0.802  2677   490
## 4 NYC       0.990  2664   361
## 5 Vegas     0.355  2537   747

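Chicago and NYC are both very well-represented by the NYC training data (about 99% of their holdout observations fall in dewpoint/vapor pressure deficit bins with 10+ training observations), while a majority of Las Vegas observations (about 65%) fall in bins that are sparse or entirely absent in the training data.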